#Feature importance

In [None]:
!pip install rfpimp

## 2.1 Data selection by ML feature importance. After you choosing the top important features, you can apply statistical methods for filtering outliers of the dataset. (see 2.2 for details)  

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

data=pd.read_csv('/content/eli_dupli_padel_D.csv')

target_list=['Molecular Weight','AlogP','NumHDonors','TPSA','NumHAcceptors','FractionCSP3','NumRotatableBonds','LogP_WildmanCrippen']

In [None]:
from sklearn.ensemble import RandomForestRegressor
import rfpimp
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
tar=target_list
#print(tar)
x= data[tar]
y= data['STANDARD_VALUE_LN']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state = 8259)
rf.fit(x_train, y_train)
imp = rfpimp.importances(rf, x_test, y_test)

y_pred_mlr=rf.predict(x_test)
meanAbErr = metrics.mean_absolute_error(y_test, y_pred_mlr)
meanSqErr = metrics.mean_squared_error(y_test, y_pred_mlr)
rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, y_pred_mlr))
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)

In [None]:
fig, ax = plt.subplots(figsize=(6, 3))

ax.barh(imp.index, imp['Importance'], height=0.8, facecolor='grey', alpha=0.8, edgecolor='k')
ax.set_xlabel('Importance score')
ax.set_title('Permutation feature importance')

plt.gca().invert_yaxis()

fig.tight_layout()

In [None]:
from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error

tar=target_list
x= data[tar]
y= data['STANDARD_VALUE_LN']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=13)

params = {
    "n_estimators": 500,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)

mse = mean_squared_error(y_test, reg.predict(X_test))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))

y_pred_mlr=reg.predict(X_test)
meanAbErr = metrics.mean_absolute_error(y_test, y_pred_mlr)
meanSqErr = metrics.mean_squared_error(y_test, y_pred_mlr)
rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, y_pred_mlr))
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)

In [None]:
feature_importance = reg.feature_importances_
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + 0.5
fig = plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.barh(pos, feature_importance[sorted_idx], align="center")
plt.yticks(pos, np.array(target_list)[sorted_idx])
plt.title("Feature Importance (MDI)")

result = permutation_importance(
    reg, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2
)
sorted_idx = result.importances_mean.argsort()
plt.subplot(1, 2, 2)
plt.boxplot(
    result.importances[sorted_idx].T,
    vert=False,
    labels=np.array(target_list)[sorted_idx],
)
plt.title("Permutation Importance (test set)")
fig.tight_layout()
plt.show()

# 2.2 Data selection by descriptors (IQR method)

In [1]:
from torch.cuda.memory import reset_max_memory_allocated
import numpy as np
test= pd.read_csv('/content/eli_dupli_padel_D.csv')

Q1M = test['Molecular Weight'].quantile(0.25)
Q3M = test['Molecular Weight'].quantile(0.75)
IQRM = Q3M - Q1M
lowerM = Q1M - 1.5*IQRM
upperM = Q3M + 1.5*IQRM
# Create arrays of Boolean values indicating the outlier rows
upper_arrayM = np.where(test['Molecular Weight']>=upperM)[0]
lower_arrayM = np.where(test['Molecular Weight']<=lowerM)[0]

# IQR method, see: https://medium.com/@pp1222001/outlier-detection-and-removal-using-the-iqr-method-6fab2954315d
Q1T = test['TPSA'].quantile(0.25)
Q3T = test['TPSA'].quantile(0.75)
IQRT = Q3T - Q1T
lowerT = Q1T - 1.5*IQRT
upperT = Q3T + 1.5*IQRT
upper_arrayT = np.where(test['TPSA']>=upperT)[0]
lower_arrayT = np.where(test['TPSA']<=lowerT)[0]

Q1A = test['AlogP'].quantile(0.25)
Q3A = test['AlogP'].quantile(0.75)
IQRA = Q3A - Q1A
lowerA = Q1A - 1.5*IQRA
upperA = Q3A + 1.5*IQRA
upper_arrayA = np.where(test['AlogP']>=upperA)[0]
lower_arrayA = np.where(test['AlogP']<=lowerA)[0]

NameError: name 'pd' is not defined

In [None]:
up_rm_MT= np.union1d(upper_arrayM, upper_arrayT)
lw_rm_MT= np.union1d(lower_arrayM, lower_arrayT)
s2=np.union1d(up_rm_MT,lw_rm_MT)
# Removing the outliers
test.drop(index=s2, inplace=True)
test.to_csv('Qrm_MW_TPSA.csv', index=False)

up_rm_MA= np.union1d(upper_arrayM, upper_arrayA)
lw_rm_MA= np.union1d(lower_arrayM, lower_arrayA)
s2=np.union1d(up_rm_MA,up_rm_MT)
s3=np.union1d(lw_rm_MA,lw_rm_MT)
s4=np.union1d(s2,s3)
# Removing the outliers
test.drop(index=s4, inplace=True)
test.to_csv('Qrm_MW_AlogP_TPSA.csv', index=False)

# 2.3 Data selection by N3/N7 ring

In [None]:
test= pd.read_csv('/content/rm_mol.csv')
# About N3/ N7 ring substrutures, please read thesis_R09945062
smiles, target= test['smiles'], test['half_life']
N3, N7= test['n3_ring'], test['n7_ring']
N3= N3.astype(np.bool)
N7= N7.astype(np.bool)
N37= np.logical_or(N3, N7)
smiles_rm_N3= smiles[~N3]
target_rm_N3= target[~N3]
smiles_rm_N7= smiles[~N7]
target_rm_N7= target[~N7]
smiles_rm_N37= smiles[~N37]
target_rm_N37= target[~N37]

df = pd.DataFrame(list(zip(smiles_rm_N3, target_rm_N3)), columns =['smiles', 'half_life'])
df.to_csv('training_rm3.csv', index=False)
df2 = pd.DataFrame(list(zip(smiles_rm_N7, target_rm_N7)), columns =['smiles', 'half_life'])
df2.to_csv('training_rm7.csv', index=False)
df3 = pd.DataFrame(list(zip(smiles_rm_N37, target_rm_N37)), columns =['smiles', 'half_life'])
df3.to_csv('training_rm37.csv', index=False)

# 2.4 Data selection by wwl distance matrix

In [None]:
X= np.load('wasserstein_distance_matrix_it6.npy')
dis=[]
for i in X:
  d= np.sum(i)
  dis.append(d)
  #print(d)
flattened_distances= np.array(dis)
q1 = np.percentile(flattened_distances, 25)
q3 = np.percentile(flattened_distances, 75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Find the indices of outliers in the flattened distance matrix
outlier_indices = np.where((flattened_distances < lower_bound) | (flattened_distances > upper_bound))
print(outlier_indices)

In [None]:
df= pd.read_csv('/content/eli_duplcate_hf.csv')
df2= df.drop(outlier_indices[0])
df2.to_csv('drop_by_q1q3.csv', index=False)

# 2.5 Even class training

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.model_selection import StratifiedShuffleSplit
from imblearn.over_sampling import KMeansSMOTE, RandomOverSampler
from sklearn.datasets import make_blobs

# https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.RandomOverSampler.html
def over_dataset(file_path: str, out_file_name: str):
  # input needs to have class labels
  test_df= pd.read_csv(file_path)
  # Here we use cluster labels as classes labels to adapt the training technique of classification
  X, y= np.array(test_df['smiles']).reshape(-1, 1) , np.array(test_df['cluster_label']).reshape(-1, 1)
  # this will keep the original class distribution of dataset in training and testing set, respectivly
  sss = StratifiedShuffleSplit(n_splits=2, test_size=0.1, random_state=1348)

  train_index, test_index = next(sss.split(X, y))
  train_laebl= test_df['cluster_label'][train_index]
  test_laebl= test_df['cluster_label'][test_index]
  # get size of each class
  train_cluster_ids, train_cluster_sizes = np.unique(train_laebl, return_counts=True)
  test_cluster_ids, test_cluster_sizes = np.unique(test_laebl, return_counts=True)
  result_split= pd.DataFrame.from_dict({'cluster_label':train_cluster_ids, 'train_size_ratio':np.array(train_cluster_sizes/len(train_index)), 'test_size_ratio':np.array(test_cluster_sizes/len(test_index))})
  print(result_split)
  # 26 cluster in kmeans
  ros= RandomOverSampler(random_state=42)
  #In default setting, this RandomOverSampler will oversampling the minor class until size of all classes are the same
  X_res, y_res = ros.fit_resample(X[train_index], y[train_index])
  print('oversample size: ', len(X_res))
  cluster_ids, cluster_sizes = np.unique(y_res, return_counts=True)
  print(cluster_sizes)
  print(cluster_ids)

  df_dict= dict(zip(test_df['smiles'], test_df['half_life']))
  tr_hf= np.array([df_dict[str(i[0])] for i in X_res])
  te_hf= np.array([df_dict[str(i[0])] for i in X[test_index]])
  ts=set([str(i[0]) for i in X_res])
  tes=set([str(i[0]) for i in X[test_index]])
  print('test smiles duplicate: ', ts.intersection(tes))

  tr= pd.DataFrame()
  tr['smiles']= np.array([str(i[0]) for i in X_res])
  tr['half_life']= tr_hf
  tr['labels']= y_res
  print(tr.shape)
  #print(tr)
  te= pd.DataFrame()
  te['smiles']= np.array([str(i[0]) for i in X[test_index]])
  te['half_life']= te_hf
  te['labels']= y[test_index]
  print(te.shape)
  #print(te)

  tr.to_csv(f'{out_file_name}_train.csv', index= False)
  te.to_csv(f'{out_file_name}_test.csv', index= False)

In [None]:
over_dataset('/content/kmean_df.csv', 'over_kmean')

# 2.6 Scaffold analysis

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import rdFMCS

df= pd.read_csv('/content/kmean_sus.csv', sep='\t')
group= np.unique(df['group'])

smi=[]
for i in group:
  print(i)
  s= df['smiles'][df['group']==i]
  smi.append({str(i):list(s)})

smi8_10_1= smi[0]['8.1'].index('CCN(CC)CCCC(C)Nc1ccnc2cc(C(F)(F)F)ccc12')
smi8_10_2= smi[0]['8.1'].index('COc1cc2nc(Nc3ccc(F)cc3)nc(NCC3CCCO3)c2cc1OC')
print(smi8_10_1, smi8_10_2)
print(smi)
s1= smi[0]['8.1'].pop(smi8_10_1)
s2= smi[0]['8.1'].pop(smi8_10_2)
smi.insert(1, {'8_10':[s1,s2]})
smi.pop(3)

In [None]:
smi

## 2.6.1 Find MCS without scaffold

In [None]:
smart_temp=[]
smart_mol=[]
for i in range(len(smi)):
  print(str(smi[i].keys()))
  smis= list(smi[i].values())
  print(smis[0])
  mols = [Chem.MolFromSmiles(i) for i in smis[0]]
  #plain MCS
  res = rdFMCS.FindMCS(mols, atomCompare=rdFMCS.AtomCompare.CompareAny, bondCompare=rdFMCS.BondCompare.CompareAny).smartsString
  m = Chem.MolFromSmarts(res)
  smart_temp.append(res)
  smart_mol.append(m)
img = Draw.MolsToGridImage(smart_mol, subImgSize=(250, 250), molsPerRow=5)
img
with open(f'MCS.png','wb+') as outf:
  outf.write(png)

# 2.6.2 Find MCS after getting scaffold

In [None]:
core_temp_free=[]
core_mol_free=[]
for i in range(len(smi)):
  print(str(smi[i].keys()))
  smis= list(smi[i].values())
  print(smis[0])
  mols = [Chem.MolFromSmiles(i) for i in smis[0]]
  temp_mol=[]
  for j in mols:
    core = MurckoScaffold.GetScaffoldForMol(j)
    smi_temp= MurckoScaffold.MurckoScaffoldSmiles(mol=j)
    #temps.append(smi_temp)
    temp_mol.append(core)
  res = rdFMCS.FindMCS(temp_mol, atomCompare=rdFMCS.AtomCompare.CompareAny, bondCompare=rdFMCS.BondCompare.CompareAny).smartsString
  core_temp_free.append(res)
  m = Chem.MolFromSmarts(res)
  print(res)
  core_mol_free.append(m)
img_free = Draw.MolsToGridImage(core_mol_free, subImgSize=(250, 250), molsPerRow=5)
img_free
png = img_free.data
with open(f'MCS_under_scaffold.png','wb+') as outf:
  outf.write(png)

#2.7 Frequency

In [None]:
whole_smi= set(pd.read_csv('/content/eli_duplcate_hf.csv')['smiles'])
frq_free=[]
samp_free=[]
comp_mol=[]
for i in range(len(smi)):
  print(str(smi[i].keys()))
  smis= list(smi[i].values())
  gs= set(smis[0])
  print(len(gs))
  mols = whole_smi.difference(gs)
  print(len(mols))
  comp_mol.append(mols)

for t in range(len(core_temp_free)):
  patt = Chem.MolFromSmarts(core_temp_free[t])
  rank=[]
  for i in comp_mol[t]:
    mol = Chem.MolFromSmiles(i)
    rk= mol.HasSubstructMatch(patt)
    rank.append(rk)
  rank_ids, rank_sizes = np.unique(rank, return_counts=True)
  frq_free.append(np.array(rank_sizes)/total)
  print(rank_ids)
  print(rank_sizes)
  samp_free.append(rank_sizes)

print(frq_free)