In [1]:
import os
import sys
import random
import numpy as np
import pandas as pd

import pickle

from rdkit import Chem
from rdkit.Chem import AllChem

import warnings
warnings.filterwarnings("ignore")

import xgboost
import KeyCompoundFinder

Load Dataset

In [2]:
# Read both input tables in a fixed order: the test table first, then the
# original data table. Paths are relative to the notebook's working directory.
df_test, df_original = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/mini-test.csv', '../data/data.csv')
)

PatentNetML

In [3]:
def feature_selection_saved(X,file):
    """Replay a pickled feature-selection pipeline on a feature table.

    The pickle stored at ``file`` is read as a mapping with four entries:
    ``'fs1'`` and ``'fs2'`` (objects exposing a ``transform`` method, applied
    in that order), ``'fs3'`` (column labels given to the output of ``fs2``)
    and ``'fs4'`` (the subset of those labels that is kept).

    Parameters
    ----------
    X : object exposing ``.values`` (the caller passes a pandas DataFrame)
        Raw feature table.
    file : str
        Path of the pickle describing the pipeline.

    Returns
    -------
    tuple
        ``(X_final, columns)``: a NumPy array holding only the kept columns,
        and the kept column labels exactly as stored under ``'fs4'``.
    """
    # NOTE(review): unpickling can execute arbitrary code - only point this
    # at pipeline files from a trusted source.
    with open(file,'rb') as handle:
        stages = pickle.load(handle)

    # Two transformers applied back to back on the raw values.
    after_first = stages.get('fs1').transform(X.values)
    after_second = stages.get('fs2').transform(after_first)

    # Re-attach labels so the final selection can be made by column name.
    labelled = pd.DataFrame(after_second,columns=stages.get('fs3'))
    kept = stages.get('fs4')
    return np.array(labelled[kept]),kept

# Ensemble configuration: for every algorithm, the round IDs whose saved
# feature-selection pipeline and classifier are read from
# ../results/models/{algorithm}_top5.
algorithms = ['xgboost','rf']
dict_rounds_get = {'xgboost': [109,143,17,28,79],
                   'rf'     : [191,176,89,66,162]}

# Load each classifier once up front; they do not depend on the patent, so
# re-reading them inside the patent loop is wasted I/O.
# NOTE(review): pickle.load can execute arbitrary code - only use model files
# from a trusted source.
dict_models = {}
for algorithm in algorithms:
    path = f'../results/models/{algorithm}_top5'
    for r in dict_rounds_get.get(algorithm):
        with open(f'{path}/{algorithm}_{r}.pkl','rb') as f:
            dict_models[(algorithm,r)] = pickle.load(f)

c = 0                # number of patents processed
dict_nml_p_ml = {}   # patent ID -> 1-based rank of the first row with Target == 1

list_probability = []   # one ranked table per patent, concatenated after the loop
for p in df_test.PATENT_ID.unique():

    df_p = df_test[df_test.PATENT_ID == p].reset_index(drop=True)
    n = df_p.shape[0]

    # Feature columns only: identifiers and the label are dropped.
    X_ = df_p.drop(columns=['PATENT_ID','P_Ca_SMILES','Target'])

    df_probability = pd.DataFrame({'PATENT_ID':df_p.PATENT_ID.tolist(),
                                   'P_Ca_SMILES':df_p.P_Ca_SMILES.tolist(),
                                   'Target':df_p.Target.tolist()})

    algorithm_columns = []
    for algorithm in algorithms:
        path = f'../results/models/{algorithm}_top5'
        round_probabilities = []
        for r in dict_rounds_get.get(algorithm):
            X,_ = feature_selection_saved(X_,f'{path}/FeatureSelection_{r}.pkl')
            clf = dict_models[(algorithm,r)]

            # Second column of predict_proba: probability of the class at index 1.
            y_proba = clf.predict_proba(X)[:,1]
            df_probability[f'Probability_{algorithm}_{r}'] = y_proba
            # Average in float64 regardless of the dtype the model returns.
            round_probabilities.append(np.asarray(y_proba,dtype=np.float64))

        # Per-algorithm score: mean over that algorithm's rounds, taken from the
        # collected arrays rather than from positional column slices.
        df_probability[f'Probability_{algorithm}'] = np.mean(round_probabilities,axis=0)
        algorithm_columns.append(f'Probability_{algorithm}')

    # Overall score: unweighted mean of the per-algorithm means.
    df_probability['Probability'] = (
        df_probability[algorithm_columns].to_numpy(dtype=np.float64).mean(axis=1)
    )
    df_probability = df_probability.sort_values(by=['Probability'],
                                                ascending=False,
                                                ignore_index=True)
    list_probability.append(df_probability)

    c +=1
    # Rank (1-based) of the best-scored row whose Target equals 1; patents with
    # no such row still count in `c` but get no entry in dict_nml_p_ml.
    hit_positions = np.flatnonzero(df_probability['Target'].to_numpy() == 1)
    if hit_positions.size:
        rank = int(hit_positions[0]) + 1
        print(f'{c} {p} : {rank} ({n})')
        dict_nml_p_ml[p] = rank

# Single concatenation instead of growing the frame inside the loop.
df_probability_total = pd.concat(list_probability) if list_probability else pd.DataFrame()

ranks = list(dict_nml_p_ml.values())
print('Num:',sum(rank <= 5 for rank in ranks))
for k in (5,1,10):
    # Share of processed patents whose rank is within the top k; nan when no
    # patent was processed (avoids a ZeroDivisionError).
    accuracy = sum(rank <= k for rank in ranks)/c if c else float('nan')
    print(f'Top{k} Accuracy:',accuracy)

1 US-20190016731 : 9 (117)
2 US-8242121 : 4 (65)
3 US-8354092 : 18 (203)
4 US-8063224 : 6 (50)
5 US-8415483 : 1 (93)
6 US-20140378489 : 47 (88)
7 US-8012956 : 67 (585)
8 US-20120232029 : 30 (81)
9 US-8877731 : 3 (162)
10 US-9359365 : 2 (648)
11 US-8809525 : 5 (228)
12 US-7572924 : 24 (208)
13 US-20130296364 : 117 (136)
14 US-4219559 : 105 (140)
Num: 5
Top5 Accuracy: 0.35714285714285715
Top1 Accuracy: 0.07142857142857142
Top10 Accuracy: 0.5


Traditional methods (CSA, MI, FOG)

In [4]:
# Patent(s) to analyse and the working directory handed to KeyCompoundFinder.
wdirs = '../results/traditional-example/'
test_patent_id  = ['US-8063224']

# Create the working directory on first use; if it is already there, record
# its current contents instead.
if not os.path.exists(wdirs):
    os.makedirs(wdirs)
else:
    file_exist = os.listdir(wdirs)

for p in test_patent_id:
    df_p = df_original[df_original.PATENT_ID == p].reset_index(drop=True)

    # One switch per method, in the order CSA, MI, FOG.
    method_switches = [True,True,True]
    # One settings dict per entry of `method_switches`
    # (presumably in the same CSA, MI, FOG order - confirm against KeyCompoundFinder).
    method_settings = [
        {'fp_class':'ECFP','fp_r':2,'fp_b':1024,'threshold':0.7},
        {'timeout':1,'threshold':0.9},
        {'timeout':60,'cluster':'a','fp_class':'ECFP','fp_r':2,'fp_b':1024,'threshold':0.6},
    ]

    df_result = KeyCompoundFinder.FindKC(
        wdirs=wdirs,
        methods=method_switches,
        params=method_settings,
        df=df_p,
    )

# Display the result of the last processed patent.
df_result

Molecular Idol is calculating: 0 in 50.
Molecular Idol is calculating: 10 in 50.
Molecular Idol is calculating: 20 in 50.
Molecular Idol is calculating: 30 in 50.
Molecular Idol is calculating: 40 in 50.


Unnamed: 0,DrugBank_ID,Name,Drug_Groups,SMILES,Ca_SMILES,PATENT_ID,HOMOGENEITY,PCT_FILTERED,CSA_in_Round5,CSA_Rank,...,CSA_in_Top5,MIdol_Rank,MIdol_Neighbor,MIdol_Score,MIdol_in_Top5,FOG_auto_in_Top5&Top2,FOG_Cluster,FOG_Rank,FOG_Rank_Share,FOG_Score
0,DB13024,MK-8245,investigational,OC(=O)CN1N=NC(=N1)C1=CC(=NO1)N1CCC(CC1)OC1=C(B...,O=C(O)Cn1nnc(-c2cc(N3CCC(Oc4cc(F)ccc4Br)CC3)no...,US-8063224,0.675,0.188,1,19,...,0,11,2,1.868817,0,1,1,1,1,75
