In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt  # `plt` must alias the pyplot module; the top-level package has no plotting functions
from rdkit.Chem import AllChem as Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import PandasTools
from rdkit.Chem import Draw
from rdkit import DataStructs
%matplotlib inline

In [None]:
# Read the tab-separated drug table; later cells use its 'smiles' and 'drug_id' columns.
drug_table_path = 'drug_smi.txt'
data = pd.read_csv(drug_table_path, sep='\t')

In [None]:
# Build RDKit molecules from the 'smiles' column into a new 'Molecule' column,
# then print the column names to confirm the column was added.
PandasTools.AddMoleculeColumnToFrame(
    data,
    'smiles',
    'Molecule',
    includeFingerprints=True,
)
print(list(map(str, data.columns)))

In [None]:
# Load the natural-product query table and display it.
# Kept for reference (disabled): building a single query molecule directly from SMILES.
#   Dihydrophenanthrene = Chem.MolFromSmiles("C1CC2=CC=CC=C2C3=CC=CC=C31")
luteolin_csv = "natural_product/Luteolin.csv"
Luteolin = pd.read_csv(luteolin_csv)
Luteolin

In [None]:
# Build RDKit molecules from the 'smiles' column into a new 'Molecule' column,
# then print the column names to confirm the column was added.
PandasTools.AddMoleculeColumnToFrame(
    Luteolin,
    'smiles',
    'Molecule',
    includeFingerprints=True,
)
print(list(map(str, Luteolin.columns)))

In [None]:
# Morgan bit-vector fingerprints (radius 2), one per drug molecule, in row order.
fplist = [
    Chem.GetMorganFingerprintAsBitVect(molecule, 2)
    for molecule in data['Molecule']
]

In [None]:
# Morgan bit-vector fingerprints (radius 2), one per query molecule, in row order.
Luteolin_fplist = [
    Chem.GetMorganFingerprintAsBitVect(molecule, 2)
    for molecule in Luteolin['Molecule']
]

In [None]:
# Store the fingerprints computed above in an 'mfp2' column (one per row)
# and preview the first three rows.
data['mfp2']=fplist
data.head(3)

In [None]:
# Store the query fingerprints in an 'mfp2' column (one per row)
# and preview the first three rows.
Luteolin['mfp2']=Luteolin_fplist
Luteolin.head(3)

In [None]:
# Score every (natural product, drug) pair by the Dice similarity of their 'mfp2'
# fingerprints. Each record is the string
#   "<pubchem_id>,<similarity rounded to 3 places>,<drug_id>"
# The next cell splits these records on commas, so the IDs must not contain commas.

# Loop-invariant drug data is extracted once instead of once per pair.
drug_fps = list(data['mfp2'])
# str() keeps the concatenation below working when an ID column was parsed as numbers.
drug_ids = [str(drug_id) for drug_id in data['drug_id']]

simlist = []
for o in Luteolin.index:
    print(o)  # progress: index label of the query row being processed
    query_fp = Luteolin.loc[o, 'mfp2']
    query_id = str(Luteolin.loc[o, 'pubchem_id'])
    # One bulk call per query molecule replaces one Python-level call per pair.
    scores = DataStructs.BulkDiceSimilarity(query_fp, drug_fps)
    for sim, drug_id in zip(scores, drug_ids):
        simlist.append(query_id + "," + str(round(sim, 3)) + "," + drug_id)

In [None]:
# Turn the "head,relation,tail" records into a frame with one column per field.
# Column 0 holds the raw record; each record is split once and the pieces reused.
new_data = pd.DataFrame(simlist)
record_parts = new_data[0].map(lambda record: record.split(","))
new_data['head'] = record_parts.map(lambda parts: parts[0])
new_data['relation'] = record_parts.map(lambda parts: parts[1])
new_data['tail'] = record_parts.map(lambda parts: parts[2])

In [None]:
# Discard the raw record column (label 0), keeping only the parsed fields.
new_data = new_data.drop(columns=[0])

In [None]:
# Display the parsed similarity triples.
new_data

In [None]:
# Load the existing graph: a headerless TSV read as head / relation / tail columns.
triple_columns = ["head", "relation", "tail"]
df = pd.read_csv("KG/dt_graph.txt", sep="\t", names=triple_columns)

In [None]:
# Stack the existing graph triples on top of the new similarity triples,
# renumbering the rows from 0, then display the result.
merge_data = pd.concat([df, new_data], ignore_index=True)
merge_data

In [None]:
# Persist the merged triples as tab-separated text without header or index.
merge_data.to_csv(
    "KG/Luteolin/Luteolin_kg.txt",
    sep="\t",
    header=None,
    index=None,
)

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
import json

In [None]:
# Accumulators for the distinct relation labels and entity labels.
rels, ents = set(), set()

In [None]:
# Collect the distinct relation and entity labels from the merged triple file.
# Each non-blank line is read as "head<TAB>relation<TAB>tail".
# The with-statement closes the file, so no explicit close() is needed afterwards.
with open('KG/Luteolin/Luteolin_kg.txt', 'rb') as f:
    for line in f:  # stream the file instead of loading every line first
        line = line.rstrip()
        if not line:
            continue  # skip blank lines instead of failing on the field lookup
        fields = line.decode().split('\t')  # decode and split once per line
        e1, rel, e2 = fields[0], fields[1], fields[2]
        rels.add(rel)
        ents.add(e1)
        ents.add(e2)

In [None]:
# Map each relation label to an integer id.
# Sorting first makes the numbering reproducible: iterating a set of strings directly
# can yield a different order on each interpreter run (string hash randomisation).
relationid = {item: idx for idx, item in enumerate(sorted(rels))}

In [None]:
# Map each entity label to an integer id.
# Sorting first makes the numbering reproducible: iterating a set of strings directly
# can yield a different order on each interpreter run (string hash randomisation).
entid = {item: idx for idx, item in enumerate(sorted(ents))}

In [None]:
# Write the relation vocabulary, one "<id><TAB><label>" line per relation.
# The with-statement closes the file even if a write fails part-way.
# Explicit UTF-8 matches the default codec used to decode the labels when they were read.
with open("KG/Luteolin/relations.tsv", "w", encoding="utf-8") as f:
    for k, v in relationid.items():
        f.write(str(v) + '\t' + k + "\n")

In [None]:
# Write the entity vocabulary, one "<id><TAB><label>" line per entity.
# The with-statement closes the file even if a write fails part-way.
# Explicit UTF-8 matches the default codec used to decode the labels when they were read.
with open("KG/Luteolin/entities.tsv", "w", encoding="utf-8") as f:
    for k, v in entid.items():
        f.write(str(v) + '\t' + k + "\n")

In [None]:
# Shuffle all triples before splitting and renumber the rows from 0.
# A fixed random_state makes the shuffle, and therefore the split files, reproducible.
shuffle_data = merge_data.sample(frac=1.0, random_state=42).reset_index(drop=True)

In [None]:
# Display the shuffled triples.
shuffle_data

In [None]:
# Number of rows in a 20% share of the shuffled triples, truncated to an integer.
int(len(shuffle_data)*0.2)

In [None]:
# Row count covered by two such 20% shares.
int(len(shuffle_data)*0.2) * 2

In [None]:
# Rows left over after removing two 20% shares. Computed from the data rather than
# the hard-coded 215662 so the check stays valid if the number of triples changes.
len(shuffle_data) - 2 * int(len(shuffle_data) * 0.2)

In [None]:
# Total number of shuffled triples.
len(shuffle_data)

In [None]:
# Scratch arithmetic: two shares of 107831 rows end at offset 215662,
# the hard-coded boundary used by the split cell below.
107831 + 107831

In [None]:
# Split the shuffled triples into three disjoint, consecutive slices.
# .iloc slices are half-open, so adjacent slices never share a boundary row;
# label-based .loc slices include both endpoints and would put the boundary rows
# into two splits at once.
# Sizes follow the 20% / 20% / remainder layout and are derived from the data
# instead of hard-coded row numbers.
# NOTE(review): this gives train 20% and valid the remaining ~60% — confirm that
# train and valid are not swapped.
fold_size = int(len(shuffle_data) * 0.2)
train = shuffle_data.iloc[:fold_size]
test = shuffle_data.iloc[fold_size:2 * fold_size]
valid = shuffle_data.iloc[2 * fold_size:]

In [None]:
# Export each split as tab-separated text without header or index.
# NOTE(review): these paths are under Luteolin/, while the other outputs of this
# notebook go to KG/Luteolin/ — confirm the target directory is intended.
for split_frame, split_path in (
    (train, "Luteolin/train.txt"),
    (valid, "Luteolin/valid.txt"),
    (test, "Luteolin/test.txt"),
):
    split_frame.to_csv(split_path, index=None, header=None, sep="\t")