In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import urllib.request # make request for get smiles of compounds from CFM

from rdkit import Chem, RDLogger, DataStructs 
from rdkit.Chem import Descriptors, AllChem, MACCSkeys, Draw
from rdkit.Chem.rdmolfiles import SmilesMolSupplier, ForwardSDMolSupplier
from rdkit.Chem.Draw import IPythonConsole
from IPython.display import Image

from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')



In [3]:
class Molecule:
    """
    class Molecule represents molecules
    
    Attributes
    ----------
    cid: PubChem Compound ID
    name : name of molecule
    smiles: SMILES
    database: database from which the molecule is taken
    mol : кeading molecule from SMILES

    Methods
    -------
    get_RDKFingerprint()
        return topological fingerprints of molecule
    get_MACCSKeys()
        return MACCS Keys fingerprints of molecule
    getMorganFingerprint()
        return Morgan fingerprints of molecule
    """
    
    def __init__(self, cid, name, smiles, database):
        self.cid = cid
        self.name = name
        self.smiles = smiles
        self.database = database
        self.mol = Chem.MolFromSmiles(self.smiles)

        
    def get_RDKFingerprint(self):
        return Chem.RDKFingerprint(self.mol)
    
    def get_MACCSKeys(self):
        return MACCSkeys.GenMACCSKeys(self.mol)
    
    def getMorganFingerprint(self):
        return AllChem.GetMorganFingerprintAsBitVect(self.mol, 2, nBits = 1024)

In [4]:
class Pair:
    """
    class Pair represents a pair of molecules
    
    Attributes
    ----------
    first_compound: the first molecule from the pair
    second_compound : the second molecule from the pair

    Methods
    -------
    get_simmilarity_using_RDKFingerprint()
        return Tanimoto similarity of molecules using topological fingerprints
    get_simmilarity_using_MACCSKeys_Fingerprint()
        return return Tanimoto similarity of molecules using MACCS Keys fingerprints
    get_simmilarity_using_Morgan_Fingerprint()
        return return Tanimoto similarity of molecules using Morgan fingerprints
    """
    def __init__(self, first_compound, second_compound):
        self.first_compound = first_compound
        self.second_compound = second_compound
        
    def get_simmilarity_using_RDKFingerprint(self):
        RDKFingerprint_first_compound = self.first_compound.get_RDKFingerprint()
        RDKFingerprint_second_compound = self.second_compound.get_RDKFingerprint()
        return DataStructs.FingerprintSimilarity(RDKFingerprint_first_compound, RDKFingerprint_second_compound)
    
    def get_simmilarity_using_MACCSKeys_Fingerprint(self):
        MACCSKeyFingerprint_first_compound = self.first_compound.get_MACCSKeys()
        MACCSKeyFingerprint_second_compound = self.second_compound.get_MACCSKeys()
        return DataStructs.FingerprintSimilarity(MACCSKeyFingerprint_first_compound, MACCSKeyFingerprint_second_compound)
    
    def get_simmilarity_using_Morgan_Fingerprint(self):
        MorganFingerprint_first_compound = self.first_compound.getMorganFingerprint()
        MorganFingerprint_second_compound = self.second_compound.getMorganFingerprint()
        return DataStructs.FingerprintSimilarity(MorganFingerprint_first_compound, MorganFingerprint_second_compound) 

In [5]:
def get_list_pairs_moleculs(list_molecules_1, list_molecules_2):
    """
    make pairs of molecules
    
    Parametrs
    ---------
    list_molecules_1: list with molecules from first database
    list_molecules_2: list with molecules from second database
    
    Return
    ------
    list with pairs of molecules (molecule from list_molecules_1, molecule from list_molecules_2), in that each pair belongs to the class Pair
    """
    
    list_of_pairs = []
    for i in range(len(list_molecules_1)):
        for j in range(len(list_molecules_2)):
            list_of_pairs.append(Pair(list_molecules_1[i], list_molecules_2[j]))
    return list_of_pairs

In [6]:
def select_pairs_by_treshold(list_of_pairs, type_fingerprints, treshold = 0.7):
    """
    select pairs of molecules whose similarity coefficient is greater than the threshold
    
    Parametrs
    ---------
    list_of_pairs: list with pairs of molecules
    type_fingerprints: type of molecular fingerprints
    treshold: set threshold, the default value is 0.7
    
    Return
    ------
    list with selected pairs of molecules, in that each pair belongs to the class Pair
    """
    
    selected_pairs = []
    if type_fingerprints== 'RDKFingerprint':
        for pair in list_of_pairs:
            if pair.get_simmilarity_using_RDKFingerprint() > treshold:
                selected_pairs.append(pair)
        return selected_pairs
                
    elif type_fingerprints == "MACCSkeys":
        for pair in list_of_pairs:
            if pair.get_simmilarity_using_MACCSKeys_Fingerprint() > treshold:
                selected_pairs.append(pair)
        return selected_pairs
                
    elif type_fingerprints == "MorganFingerprint":
        for pair in list_of_pairs:
            if pair.get_simmilarity_using_Morgan_Fingerprint() > treshold:
                selected_pairs.append(pair)
        return selected_pairs

Сначала создадим список молекул из базы L1000FWD.

In [7]:
L1000FWD_drug = pd.read_table('https://maayanlab.cloud/L1000FWD/download/Drugs_metadata.csv', sep = ',')
print('shape of dataframe L1000FWD_drug:', L1000FWD_drug.shape)
L1000FWD_drug.head()
L1000FWD_drug = L1000FWD_drug.dropna(subset = ['canonical_smiles', 'pert_iname', 'pubchem_cid'])

shape of dataframe L1000FWD_drug: (20449, 13)


In [8]:
molecules_L1000FWD_drug = []
for (smiles, name, cid) in zip(L1000FWD_drug['canonical_smiles'], L1000FWD_drug['pert_iname'], L1000FWD_drug['pubchem_cid']):
    molecule = Molecule(cid, name, smiles, 'L1000FWD') # представляем молекулы в виде объектов класса Molecule
    molecules_L1000FWD_drug.append(molecule)

In [9]:
len(molecules_L1000FWD_drug)

20363

Теперь создадим список молекул из базы CFM.

In [10]:
chemicals_CFM = pd.read_table('DATA/intesect_CFM_L1000FWD/direct_reprogramming_non-genetics - Chemicals.csv', sep = ',')
print('shape of dataframe chemicals_CFM:', chemicals_CFM.shape)
chemicals_CFM = chemicals_CFM.dropna(subset = ['cid', 'name'])
chemicals_CFM.head()

shape of dataframe chemicals_CFM: (158, 5)


Unnamed: 0,cid,link,name,Synonyms,MOA
0,459803,https://pubchem.ncbi.nlm.nih.gov/compound/459803,AC1LA18U,,Inhibitor of the HIF prolyl 4-hydroxylase
1,91899426,https://pubchem.ncbi.nlm.nih.gov/compound/9189...,2-phospho-L-ascorbic acid,2-phospho-L-ascorbic acid;BDBM92477,_
2,286003,https://pubchem.ncbi.nlm.nih.gov/compound/286003,DTXSID80301486,,Agonist of the Adenosine Receptor
3,47289,https://pubchem.ncbi.nlm.nih.gov/compound/47289,4-(Methylnitrosamino)-1-(3-pyridyl)-1-butanone,,_
4,451668,https://pubchem.ncbi.nlm.nih.gov/compound/451668,5-Aza-2'-deoxycytidine,Decitabine;5-Aza-2'-deoxycytidine;2353-33-5;Da...,Inhibitor of DNA methylation


In [11]:
def get_smiles_for_molecules_CFM(chemicals_CFM):
    """
    get smiles of molecules by cid of molecules and wtite smiles in file "smiles_of_molecules_CFM.txt"
    
    Parametrs
    ---------
    chemicals_CFM: dataframe with column with cids of molecules
    """
        
    str_with_cids = str()
    for cid in chemicals_CFM['cid']:
        if cid != '_':
            url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + str(cid) + '/property/CanonicalSMILES/csv'
            response = urllib.request.urlopen(url)
            str_response = response.read().decode('utf-8')
            list_responce = str_response.split(',')
            cid_with_extra_quotes = list_responce[-1]
            str_with_cids = str_with_cids + cid_with_extra_quotes
    with open("DATA/intesect_CFM_L1000FWD/smiles_of_molecules_CFM.txt", "w") as file:
        file.write(str_with_cids)

In [12]:
get_smiles_for_molecules_CFM(chemicals_CFM) # записали smiles в файл

In [12]:
def list_molecules_chemicals_CFM(chemicals_CFM, file_with_smiles):
    """
    make list of molecules, in which each molecule belongs class Molecule
    
    Parametrs
    ---------
    chemicals_CFM: dataframe with column with cids, names of molecules
    file_with_smiles: file with smiles of molecules
    
    Return
    ------
    list of molecules, in which each molecule belongs class Molecule
    """
    
    list_smiles_of_molecules_CFM = []
    with open(file_with_smiles, "r") as file:
        for line in file:
            list_smiles_of_molecules_CFM.append(line[1:-2])
        
    molecules_chemicals_CFM = []
    for (name, cid, smiles) in zip(chemicals_CFM['name'], chemicals_CFM['cid'], list_smiles_of_molecules_CFM):
        molecule = Molecule(cid, name, smiles, 'CFM')
        molecules_chemicals_CFM.append(molecule)
    return molecules_chemicals_CFM

In [13]:
molecules_chemicals_CFM = list_molecules_chemicals_CFM(chemicals_CFM, 'DATA/intesect_CFM_L1000FWD/smiles_of_molecules_CFM.txt')

Создадим список, в котором представлены всевозможные пары молекул (первая молекула принадлежит базе L1000FWD, вторая молекула CFM). Элементом списка является объект, принадлежащий классу Pair.

In [14]:
list_pairs = get_list_pairs_moleculs(molecules_L1000FWD_drug, molecules_chemicals_CFM)

In [15]:
len(list_pairs)

20363

Для примера отберем пары молекул, коэффициент сходства которых больше 0,7 и используются Morgan fingerprints.

In [18]:
selected_pairs = select_pairs_by_treshold(list_pairs, "MorganFingerprint", treshold = 0.7)

ArgumentError: Python argument types in
    rdkit.Chem.rdMolDescriptors.GetMorganFingerprintAsBitVect(NoneType, int)
did not match C++ signature:
    GetMorganFingerprintAsBitVect(RDKit::ROMol mol, int radius, unsigned int nBits=2048, boost::python::api::object invariants=[], boost::python::api::object fromAtoms=[], bool useChirality=False, bool useBondTypes=True, bool useFeatures=False, boost::python::api::object bitInfo=None)

In [18]:
print('число пар :', len(selected_pairs))

число пар : 161


Посмотрим таблицу для клеточных переходов.

In [19]:
structure_CFM = pd.read_table('direct_reprogramming_non-genetics - structure.csv', sep = ',')
print('shape :', structure_CFM.shape)
structure_CFM.head()

shape : (169, 19)


Unnamed: 0,DOI,Species,Source Cell Type,Target Cell Type,"Duration, d","name of chemical 1,CID 1;name of chemical 2,CID 2",Medium,Growth Factors,Initial Culture (text),MoA chem 1; MoA chem 2,TFs/miRNA,Title,Assocoated Data,Yield (%),Comment on Yield,Type,Comment,"Authors;sorted=""Reverse"";sep=;",Stress factors
0,10.1371/journal.pone.0089678,Mus musculus,Fibroblasts,Induced Cardiomyocytes,14,"SB431542,4521392",doxycycline (2 µg mL−1) and the PGK-H2B-m Cher...,_,"MEFs, isolated at E14.5",Inhibitor of the Activin/BMP/TGF-β pathway,_,Inhibition of TGFβ signaling increases direct ...,GSE54022,16.95,,2,_,John D. Gearhart; Jonathan A. Epstein; Russell...,_
1,10.1007/s12015-013-9477-9,Sus scrofa,Fibroblasts,Induced Pancreatic Beta Cells,36,"5-Aza-2'-deoxycytidine,9444","N2B27 with 0.1 mM β- mercaptoetanol (Sigma), 2...",basic fibroblast growth factor 20 ng/ml,Primary porcine skin fibroblast cultures were ...,Inhibitor of DNA methylation,_,Reprogramming of pig dermal fibroblast into in...,_,38.1,Counting insulin labeled cells with a flow cyt...,1,_,F.; Gandolfi; T. A. L.; Brevini; M. M.; Rahman...,_
2,10.1038/ncomms3183,Homo sapiens,Fibroblasts,Induced Cholinergic Neurons,28,"Forskolin,47936;Dorsomorphin,11524144",_,FGF2 (10 ng ml−1),Human fibroblasts were obtained from commercia...,"CAMP agonist;Inhibitor of ALK2, ALK3, ALK6",NGN2,Small Molecules Enable Neurogenin 2 to Efficie...,GSE45954,57.2,,2,_,Oliver Brüstle; Philipp Koch; Franz-Josef Müll...,_
3,10.1016/j.biocel.2013.04.022,Homo sapiens,Mesenchymal Stem Cells (Bone marrow derived),Induced Neurons,28,"SB431542,4521392;Dorsomorphin,11524144",NeuroCult supplemented with 1% penicillin/stre...,10 ng/ml recombinant human bFGF,8-week fetal forebrain,Inhibitor of the Activin/BMP/TGF-β pathway;Inh...,_,Enhancing the efficiency of direct reprogrammi...,_,1.0,"Sodium current, Electrophysiological recordings",1,_,Zhiying Zhang; Qing-song Liu;Alexanian Arshak R.,_
4,10.1371/journal.pone.0003531,Mus musculus,Primordial Germ Cells,Induced Pluripotent Epiblast Cells,10,"Trichostatin A,444732","LIF (1200 IU/ml), FGF-2 (25 ng/ml)",FGF-2 (25 ng/ml),"EG cell lines, 8.5 EGC-1 and 4-3 Rosa, were de...",Inhibitor of the histone deacetylases I and II,_,Reprogramming Primordial Germ Cells into Pluri...,_,30.0,cells were stained for TNAP activity and the n...,2,_,M. Azim Surani; Reuben Tooze; Gina Doody; Fuch...,_


На основе этой таблицы создадим таблицу, в которой к каждому клеточному переходу укажем пары молекул (первая молекула принадлежит базе L1000FWD, вторая молекула - CFM), которые могут осуществить этот переход, исходя из того, что в таблице structure_CFM указаны cids молекул из базы CFM, которые обладают потенциал для перепрограммирования. То есть, если вторая молекула из пары указана для клеточного перехода, то считаем что первая молекула  из пары тоже может осуществить этот переход.

In [20]:
dataframe_selected_pairs_with_features = structure_CFM.iloc[:, 0:6]
dataframe_selected_pairs_with_features['smiles of chemicals'] = 'not molecules'
for pair in selected_pairs:
    for i in range(structure_CFM.shape[0]):
        string_molecules = structure_CFM.iloc[i, 5]
        list_molecules = string_molecules.split(';')
        list_cids = []
        for mol in list_molecules:
            list_cids.append(mol.split(',')[-1])
        if pair.second_compound.cid in list_cids:
            if dataframe_selected_pairs_with_features.iloc[i, 6] == 'not molecules':
                dataframe_selected_pairs_with_features.iloc[i, 6] = pair.first_compound.smiles + ';'+ pair.second_compound.smiles
            else :
                dataframe_selected_pairs_with_features.iloc[i, 6] = dataframe_selected_pairs_with_features.iloc[i, 6] + ';' + pair.first_compound.smiles + ';'+ pair.second_compound.smiles

Посмтрим на итоговую таблицу для клеточных переходов.

In [21]:
print('shape :', dataframe_selected_pairs_with_features.shape)
dataframe_selected_pairs_with_features.head()

shape : (169, 7)


Unnamed: 0,DOI,Species,Source Cell Type,Target Cell Type,"Duration, d","name of chemical 1,CID 1;name of chemical 2,CID 2",smiles of chemicals
0,10.1371/journal.pone.0089678,Mus musculus,Fibroblasts,Induced Cardiomyocytes,14,"SB431542,4521392",ONC(=O)CCCCCN1C(=O)c2cccc3cccc(C1=O)c23;C1=CC2...
1,10.1007/s12015-013-9477-9,Sus scrofa,Fibroblasts,Induced Pancreatic Beta Cells,36,"5-Aza-2'-deoxycytidine,9444",not molecules
2,10.1038/ncomms3183,Homo sapiens,Fibroblasts,Induced Cholinergic Neurons,28,"Forskolin,47936;Dorsomorphin,11524144",CC(=O)O[C@H]1[C@@H](O)C2C(C)(C)CC[C@H](O)[C@]2...
3,10.1016/j.biocel.2013.04.022,Homo sapiens,Mesenchymal Stem Cells (Bone marrow derived),Induced Neurons,28,"SB431542,4521392;Dorsomorphin,11524144",ONC(=O)CCCCCN1C(=O)c2cccc3cccc(C1=O)c23;C1=CC2...
4,10.1371/journal.pone.0003531,Mus musculus,Primordial Germ Cells,Induced Pluripotent Epiblast Cells,10,"Trichostatin A,444732",not molecules


Отфильтруем переходы, для которых не были найдены пары молекул.

In [22]:
print('shape :', dataframe_selected_pairs_with_features[dataframe_selected_pairs_with_features['smiles of chemicals'] != 'not molecules'].shape) 
dataframe_selected_pairs_with_features[dataframe_selected_pairs_with_features['smiles of chemicals'] != 'not molecules'].head()

shape : (148, 7)


Unnamed: 0,DOI,Species,Source Cell Type,Target Cell Type,"Duration, d","name of chemical 1,CID 1;name of chemical 2,CID 2",smiles of chemicals
0,10.1371/journal.pone.0089678,Mus musculus,Fibroblasts,Induced Cardiomyocytes,14,"SB431542,4521392",ONC(=O)CCCCCN1C(=O)c2cccc3cccc(C1=O)c23;C1=CC2...
2,10.1038/ncomms3183,Homo sapiens,Fibroblasts,Induced Cholinergic Neurons,28,"Forskolin,47936;Dorsomorphin,11524144",CC(=O)O[C@H]1[C@@H](O)C2C(C)(C)CC[C@H](O)[C@]2...
3,10.1016/j.biocel.2013.04.022,Homo sapiens,Mesenchymal Stem Cells (Bone marrow derived),Induced Neurons,28,"SB431542,4521392;Dorsomorphin,11524144",ONC(=O)CCCCCN1C(=O)c2cccc3cccc(C1=O)c23;C1=CC2...
5,10.2217/rme.10.67,Homo sapiens,Mesenchymal Stem Cells (Bone marrow derived),Induced Neurons,28,"Trichostatin A,444732;5-Aza-2'-deoxycytidine,1...",CC(=O)O[C@H]1[C@@H](O)C2C(C)(C)CC[C@H](O)[C@]2...
6,10.1002/hep.23506,Homo sapiens,Embryonic stem cells,Induced Hepatocytes,20,"Retinoic acid,444795",OC(=O)[C@H](Cc1c[nH]c2ccccc12)N1C(=O)c2ccccc2C...


In [23]:
dataframe_selected_pairs_with_features.iloc[2,6]

'CC(=O)O[C@H]1[C@@H](O)C2C(C)(C)CC[C@H](O)[C@]2(C)[C@@]3(O)C(=O)C[C@@](C)(O[C@]13C)C=C;CC(=O)OC1C(C2C(CCC(C2(C3(C1(OC(CC3=O)(C)C=C)C)O)C)O)(C)C)O;CC(=O)O[C@H]1[C@@H](O)[C@H]2C(C)(C)CC[C@H](O)[C@]2(C)[C@@]3(O)C(=O)C[C@@](C)(O[C@]13C)C=C;CC(=O)OC1C(C2C(CCC(C2(C3(C1(OC(CC3=O)(C)C=C)C)O)C)O)(C)C)O;CC(=O)OC1C(O)C2C(C)(C)CCC(O)C2(C)C2(O)C(=O)CC(C)(OC12C)C=C;CC(=O)OC1C(C2C(CCC(C2(C3(C1(OC(CC3=O)(C)C=C)C)O)C)O)(C)C)O;CC(=O)O[C@H]1[C@@H](O)[C@H]2C(C)(C)CC[C@@H](O)[C@]2(C)[C@@]3(O)C(=O)C[C@@](C)(O[C@]13C)C=C;CC(=O)OC1C(C2C(CCC(C2(C3(C1(OC(CC3=O)(C)C=C)C)O)C)O)(C)C)O;C(CN1CCCCC1)Oc2ccc(cc2)c3cnc4c(cnn4c3)c5ccncc5;C1CCN(CC1)CCOC2=CC=C(C=C2)C3=CN4C(=C(C=N4)C5=CC=NC=C5)N=C3'

In [24]:
with open("DATA/intesect_CFM_L1000FWD/table_of_cell_conversion_and_chemicals.csv", "w") as file:
    dataframe_selected_pairs_with_features.to_csv("DATA/intesect_CFM_L1000FWD/table_of_cell_conversion_and_chemicals.csv")

Объединим одинаковые клеточные переходы, которые соответствуют разным видам.