In [None]:
import rdkit
from rdkit.Chem import PandasTools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import rdkit

# Location of the SD file holding the structures and their "IC50 uM" field.
stFile = r"C:\Users\am7574\OneDrive - Corteva\Documents\Projects\AI_chemistry\TK_AI\TK_ALL.sdf"
data = PandasTools.LoadSDF(stFile)

# Keep only the molecule and IC50 columns, discard records missing either, and
# take an explicit copy so the column added below is written to an independent
# frame rather than to a slice derived from `data`.
dataMol = data[["ROMol", "IC50 uM"]].dropna().copy()

# Binary label: 0 when the field is the literal 'NI' or parses to a number at
# or above the threshold, 1 otherwise.
ACTIVITY_THRESHOLD = 20
dataMol['activity'] = dataMol['IC50 uM'].apply(
    lambda x: 0 if x == 'NI' or float(x) >= ACTIVITY_THRESHOLD else 1
)

def assessModel(y_test, y_pred):
    """Print F1, accuracy, confusion matrix and classification report.

    Each value is computed by the scikit-learn metric of the same name from
    ``y_test`` and ``y_pred`` (passed in that order) and printed in turn.
    Nothing is returned.
    """
    from sklearn import metrics

    reporters = (
        metrics.f1_score,
        metrics.accuracy_score,
        metrics.confusion_matrix,
        metrics.classification_report,
    )
    for report in reporters:
        print(report(y_test, y_pred))

def calcFingerprint(stFile: str) -> pd.DataFrame:
    """Load an SD file and attach a Morgan count fingerprint to each record.

    NOTE(review): a later ``calcFingerprint(mols)`` definition in this cell
    rebinds the name, so this file-path variant is not the one called below.

    Parameters
    ----------
    stFile : str
        Path of the SD file passed to ``PandasTools.LoadSDF``.

    Returns
    -------
    pd.DataFrame
        The loaded records with every row containing a missing value removed,
        plus a ``"MolFP"`` column holding the NumPy count fingerprint of the
        row's ``"ROMol"``.
    """
    from rdkit.Chem import PandasTools
    from rdkit.Chem import rdFingerprintGenerator

    sdData = PandasTools.LoadSDF(stFile)
    mfpGen = rdFingerprintGenerator.GetMorganGenerator()
    # Filter first and copy: fingerprints are then computed only for the rows
    # that are returned, and the new column is written to an independent frame.
    dataMol = sdData.dropna().copy()
    dataMol["MolFP"] = dataMol["ROMol"].apply(lambda x: mfpGen.GetCountFingerprintAsNumPy(x))
    return dataMol

def calcProp(stFile: str) -> pd.DataFrame:
    """Load an SD file and attach RDKit descriptor values to each record.

    NOTE(review): a later ``calcProp(mols)`` definition in this cell rebinds
    the name, so this file-path variant is not the one called below.

    Parameters
    ----------
    stFile : str
        Path of the SD file passed to ``PandasTools.LoadSDF``.

    Returns
    -------
    pd.DataFrame
        The loaded records with every row containing a missing value removed,
        plus a ``"prop"`` column holding the result of ``CalcDescriptors`` for
        the row's ``"ROMol"``, using every name listed in
        ``Descriptors._descList``.
    """
    # Import the descriptor module explicitly instead of reaching it through
    # the global ``rdkit`` name and relying on another import having loaded it.
    from rdkit.Chem import Descriptors
    from rdkit.Chem import PandasTools
    from rdkit.ML.Descriptors import MoleculeDescriptors

    sdData = PandasTools.LoadSDF(stFile)
    # Filter first and copy: descriptors are then computed only for the rows
    # that are returned, and the new column is written to an independent frame.
    dataMol = sdData.dropna().copy()
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    dataMol["prop"] = dataMol["ROMol"].apply(lambda x: calc.CalcDescriptors(x))
    return dataMol

def calcFingerprint(mols: pd.DataFrame) -> pd.DataFrame:
    """Add a Morgan count-fingerprint column to ``mols`` and return it.

    ``mols`` is modified in place: a ``"MolFP"`` column is written holding, for
    each row, the NumPy count fingerprint of the molecule in ``"ROMol"``,
    produced by a Morgan generator created with its default arguments. The
    same frame object is returned.
    """
    from rdkit.Chem import rdFingerprintGenerator

    generator = rdFingerprintGenerator.GetMorganGenerator()

    def toCountVector(mol):
        return generator.GetCountFingerprintAsNumPy(mol)

    fingerprints = mols["ROMol"].apply(toCountVector)
    mols["MolFP"] = fingerprints
    return mols

def calcProp(mols: pd.DataFrame) -> pd.DataFrame:
    """Add an RDKit descriptor column to ``mols`` and return it.

    Descriptor names are taken from ``rdkit.Chem.Descriptors._descList``,
    leaving out any whose name contains "PartialCharge", "BCUT2D" or "Morgan".
    ``mols`` is modified in place: a ``"prop"`` column is written holding the
    ``CalcDescriptors`` result for each row's ``"ROMol"``. The same frame
    object is returned.
    """
    from rdkit.ML.Descriptors import MoleculeDescriptors

    skipped = ("PartialCharge", "BCUT2D", "Morgan")
    des_name = []
    for entry in rdkit.Chem.Descriptors._descList:
        label = entry[0]
        if not any(tag in label for tag in skipped):
            des_name.append(label)

    calc = MoleculeDescriptors.MolecularDescriptorCalculator(des_name)

    def describe(mol):
        return calc.CalcDescriptors(mol)

    mols["prop"] = mols["ROMol"].apply(describe)
    return mols

def assessModel(y_test, y_pred):
    """Print F1, accuracy, confusion matrix and classification report.

    Each value is computed by the scikit-learn metric of the same name from
    ``y_test`` and ``y_pred`` (passed in that order) and printed in turn.
    Nothing is returned.
    """
    from sklearn import metrics

    reporters = (
        metrics.f1_score,
        metrics.accuracy_score,
        metrics.confusion_matrix,
        metrics.classification_report,
    )
    for report in reporters:
        print(report(y_test, y_pred))

def getBestModel(X_train, y_train, X_test, y_test):
    """Grid-search several classifiers, print test metrics, return the search.

    A one-step pipeline whose step is named ``'classifier'`` is tuned with a
    5-fold ``GridSearchCV`` over five estimator families (random forest,
    liblinear logistic regression, gradient boosting, AdaBoost and SVC) on
    ``X_train``/``y_train``. The fitted search then predicts ``X_test`` and the
    predictions are passed together with ``y_test`` to ``assessModel``.

    Returns the fitted ``GridSearchCV`` object.
    """
    import numpy as np
    from sklearn.ensemble import (
        AdaBoostClassifier,
        GradientBoostingClassifier,
        RandomForestClassifier,
    )
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC

    # One grid per estimator family; the list order below is kept as-is.
    forest_grid = {
        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': [10, 100, 1000],
        'classifier__max_features': [1, 2, 3],
    }
    logistic_grid = {
        'classifier': [LogisticRegression(solver='liblinear')],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__C': np.logspace(0, 4, 10),
    }
    boosting_grid = {
        'classifier': [GradientBoostingClassifier()],
        'classifier__n_estimators': [10, 100, 1000],
        'classifier__learning_rate': [0.001, 0.01, 0.1],
        'classifier__max_depth': [1, 2, 3],
    }
    adaboost_grid = {'classifier': [AdaBoostClassifier()]}
    svc_grid = {
        'classifier': [SVC()],
        'classifier__C': [0.1, 1, 10],
        'classifier__gamma': [1, 0.1, 0.01],
        'classifier__kernel': ['rbf', 'poly', 'sigmoid'],
    }
    search_space = [forest_grid, logistic_grid, boosting_grid, adaboost_grid, svc_grid]

    pipe = Pipeline([('classifier', RandomForestClassifier())])
    search = GridSearchCV(pipe, search_space, cv=5, verbose=0, n_jobs=-1)
    best_model = search.fit(X_train, y_train)
    assessModel(y_test, best_model.predict(X_test))
    return best_model

def getBestfeatures(X, y, k=30, random_state=None):
    """Keep the ``k`` features of ``X`` scoring highest on mutual information with ``y``.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Class label for each row of ``X``.
    k : int, optional
        Number of features to keep. Defaults to 30, the value that was
        previously hard-coded here.
    random_state : int or None, optional
        Seed forwarded to ``mutual_info_classif``. With the default ``None``
        the scorer is used exactly as before; pass an integer to make the
        selection repeatable between runs.

    Returns
    -------
    The reduced feature matrix returned by ``SelectKBest.fit_transform``; its
    shape is printed before returning.
    """
    from functools import partial
    from sklearn.feature_selection import SelectKBest, mutual_info_classif

    scorer = mutual_info_classif
    if random_state is not None:
        scorer = partial(mutual_info_classif, random_state=random_state)
    X_new = SelectKBest(scorer, k=k).fit_transform(X, y)
    print(X_new.shape)
    return X_new

def getBestFeatureSFS(X, y, n_features_to_select=30, direction='backward'):
    """Select features with a random-forest-driven sequential feature selector.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Class label for each row of ``X``.
    n_features_to_select : int, optional
        Forwarded to ``SequentialFeatureSelector``. Defaults to 30, the value
        that was previously hard-coded here.
    direction : str, optional
        Forwarded to ``SequentialFeatureSelector``. Defaults to ``'backward'``,
        the direction that was previously hard-coded here.

    Returns
    -------
    ``X`` transformed by the fitted selector; its shape is printed before
    returning.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import SequentialFeatureSelector

    selector = SequentialFeatureSelector(
        RandomForestClassifier(),
        n_features_to_select=n_features_to_select,
        direction=direction,
    )
    X_new = selector.fit(X, y).transform(X)
    print(X_new.shape)
    return X_new


from sklearn.model_selection import train_test_split

# --- Morgan count fingerprints as features ---------------------------------
X_fp = calcFingerprint(dataMol)
X = X_fp["MolFP"].to_list()
y = dataMol['activity'].to_list()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
getBestModel(X_train, y_train, X_test, y_test)
# NOTE(review): features are selected on the full X, y before the split, so the
# held-out rows take part in feature selection -- confirm this is intended.
X_new = getBestfeatures(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)
getBestModel(X_train, y_train, X_test, y_test)
# Disabled: sequential-feature-selection variant.
# X_new = getBestFeatureSFS(X, y)
# X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)
# getBestModel(X_train, y_train, X_test, y_test)

# --- RDKit descriptors as features -----------------------------------------
X_prop = calcProp(dataMol)
X = X_prop["prop"].to_list()
y = dataMol['activity'].to_list()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
getBestModel(X_train, y_train, X_test, y_test)
# NOTE(review): same selection-before-split pattern as above.
X_new = getBestfeatures(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)
# Bind the result so the cell does not end on a bare expression, which a
# notebook would echo as the cell's output.
best_model = getBestModel(X_train, y_train, X_test, y_test)
# Disabled: sequential-feature-selection variant.
# X_new = getBestFeatureSFS(X, y)
# X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)
# getBestModel(X_train, y_train, X_test, y_test)

0.5833333333333334
0.9074074074074074
[[91  2]
 [ 8  7]]
              precision    recall  f1-score   support

           0       0.92      0.98      0.95        93
           1       0.78      0.47      0.58        15

    accuracy                           0.91       108
   macro avg       0.85      0.72      0.77       108
weighted avg       0.90      0.91      0.90       108

(357, 30)


  _data = np.array(data, dtype=dtype, copy=copy,


0.5714285714285714
0.9166666666666666
[[93  0]
 [ 9  6]]
              precision    recall  f1-score   support

           0       0.91      1.00      0.95        93
           1       1.00      0.40      0.57        15

    accuracy                           0.92       108
   macro avg       0.96      0.70      0.76       108
weighted avg       0.92      0.92      0.90       108



45 fits failed out of a total of 420.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "c:\miniforge3\envs\chem\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\miniforge3\envs\chem\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\miniforge3\envs\chem\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\miniforge3\envs\chem\Lib\site-packages\sklearn\base.py", line 

0.0
0.8425925925925926
[[91  2]
 [15  0]]
              precision    recall  f1-score   support

           0       0.86      0.98      0.91        93
           1       0.00      0.00      0.00        15

    accuracy                           0.84       108
   macro avg       0.43      0.49      0.46       108
weighted avg       0.74      0.84      0.79       108

(357, 30)
0.2857142857142857
0.8611111111111112
[[90  3]
 [12  3]]
              precision    recall  f1-score   support

           0       0.88      0.97      0.92        93
           1       0.50      0.20      0.29        15

    accuracy                           0.86       108
   macro avg       0.69      0.58      0.60       108
weighted avg       0.83      0.86      0.83       108



' X_new = getBestFeatureSFS(X, y)\nX_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)   \ngetBestModel(X_train, y_train, X_test, y_test) '

In [None]:
import rdkit
from rdkit.Chem import PandasTools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import rdkit

# Location of the SD file holding the structures and their "IC50 uM" field.
stFile = r"C:\Users\am7574\OneDrive - Corteva\Documents\Projects\AI_chemistry\TK_AI\TK_ALL.sdf"
data = PandasTools.LoadSDF(stFile)

# Keep only the molecule and IC50 columns, discard records missing either, and
# take an explicit copy so the column added below is written to an independent
# frame rather than to a slice derived from `data`.
dataMol = data[["ROMol", "IC50 uM"]].dropna().copy()

# Binary label: 0 when the field is the literal 'NI' or parses to a number at
# or above the threshold, 1 otherwise.
ACTIVITY_THRESHOLD = 20
dataMol['activity'] = dataMol['IC50 uM'].apply(
    lambda x: 0 if x == 'NI' or float(x) >= ACTIVITY_THRESHOLD else 1
)

def assessModel(y_test, y_pred):
    """Print F1, accuracy, confusion matrix and classification report.

    Each value is computed by the scikit-learn metric of the same name from
    ``y_test`` and ``y_pred`` (passed in that order) and printed in turn.
    Nothing is returned.
    """
    from sklearn import metrics

    reporters = (
        metrics.f1_score,
        metrics.accuracy_score,
        metrics.confusion_matrix,
        metrics.classification_report,
    )
    for report in reporters:
        print(report(y_test, y_pred))

def calcFingerprint(stFile: str) -> pd.DataFrame:
    """Load an SD file and attach a Morgan count fingerprint to each record.

    NOTE(review): a later ``calcFingerprint(mols)`` definition in this cell
    rebinds the name, so this file-path variant is not the one called below.

    Parameters
    ----------
    stFile : str
        Path of the SD file passed to ``PandasTools.LoadSDF``.

    Returns
    -------
    pd.DataFrame
        The loaded records with every row containing a missing value removed,
        plus a ``"MolFP"`` column holding the NumPy count fingerprint of the
        row's ``"ROMol"``.
    """
    from rdkit.Chem import PandasTools
    from rdkit.Chem import rdFingerprintGenerator

    sdData = PandasTools.LoadSDF(stFile)
    mfpGen = rdFingerprintGenerator.GetMorganGenerator()
    # Filter first and copy: fingerprints are then computed only for the rows
    # that are returned, and the new column is written to an independent frame.
    dataMol = sdData.dropna().copy()
    dataMol["MolFP"] = dataMol["ROMol"].apply(lambda x: mfpGen.GetCountFingerprintAsNumPy(x))
    return dataMol

def calcProp(stFile: str) -> pd.DataFrame:
    """Load an SD file and attach RDKit descriptor values to each record.

    NOTE(review): a later ``calcProp(mols)`` definition in this cell rebinds
    the name, so this file-path variant is not the one called below.

    Parameters
    ----------
    stFile : str
        Path of the SD file passed to ``PandasTools.LoadSDF``.

    Returns
    -------
    pd.DataFrame
        The loaded records with every row containing a missing value removed,
        plus a ``"prop"`` column holding the result of ``CalcDescriptors`` for
        the row's ``"ROMol"``, using every name listed in
        ``Descriptors._descList``.
    """
    # Import the descriptor module explicitly instead of reaching it through
    # the global ``rdkit`` name and relying on another import having loaded it.
    from rdkit.Chem import Descriptors
    from rdkit.Chem import PandasTools
    from rdkit.ML.Descriptors import MoleculeDescriptors

    sdData = PandasTools.LoadSDF(stFile)
    # Filter first and copy: descriptors are then computed only for the rows
    # that are returned, and the new column is written to an independent frame.
    dataMol = sdData.dropna().copy()
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    dataMol["prop"] = dataMol["ROMol"].apply(lambda x: calc.CalcDescriptors(x))
    return dataMol

def calcFingerprint(mols: pd.DataFrame) -> pd.DataFrame:
    """Add a Morgan count-fingerprint column to ``mols`` and return it.

    ``mols`` is modified in place: a ``"MolFP"`` column is written holding, for
    each row, the NumPy count fingerprint of the molecule in ``"ROMol"``,
    produced by a Morgan generator created with its default arguments. The
    same frame object is returned.
    """
    from rdkit.Chem import rdFingerprintGenerator

    generator = rdFingerprintGenerator.GetMorganGenerator()

    def toCountVector(mol):
        return generator.GetCountFingerprintAsNumPy(mol)

    fingerprints = mols["ROMol"].apply(toCountVector)
    mols["MolFP"] = fingerprints
    return mols

def calcProp(mols: pd.DataFrame) -> pd.DataFrame:
    """Add an RDKit descriptor column to ``mols`` and return it.

    Descriptor names are taken from ``rdkit.Chem.Descriptors._descList``,
    leaving out any whose name contains "PartialCharge", "BCUT2D" or "Morgan".
    ``mols`` is modified in place: a ``"prop"`` column is written holding the
    ``CalcDescriptors`` result for each row's ``"ROMol"``. The same frame
    object is returned.
    """
    from rdkit.ML.Descriptors import MoleculeDescriptors

    skipped = ("PartialCharge", "BCUT2D", "Morgan")
    des_name = []
    for entry in rdkit.Chem.Descriptors._descList:
        label = entry[0]
        if not any(tag in label for tag in skipped):
            des_name.append(label)

    calc = MoleculeDescriptors.MolecularDescriptorCalculator(des_name)

    def describe(mol):
        return calc.CalcDescriptors(mol)

    mols["prop"] = mols["ROMol"].apply(describe)
    return mols

def assessModel(y_test, y_pred):
    """Print F1, accuracy, confusion matrix and classification report.

    Each value is computed by the scikit-learn metric of the same name from
    ``y_test`` and ``y_pred`` (passed in that order) and printed in turn.
    Nothing is returned.
    """
    from sklearn import metrics

    reporters = (
        metrics.f1_score,
        metrics.accuracy_score,
        metrics.confusion_matrix,
        metrics.classification_report,
    )
    for report in reporters:
        print(report(y_test, y_pred))

def getBestModel(X_train, y_train, X_test, y_test):
    """Grid-search several classifiers, print test metrics, return the search.

    A one-step pipeline whose step is named ``'classifier'`` is tuned with a
    5-fold ``GridSearchCV`` over five estimator families (random forest,
    liblinear logistic regression, gradient boosting, AdaBoost and SVC) on
    ``X_train``/``y_train``. The fitted search then predicts ``X_test`` and the
    predictions are passed together with ``y_test`` to ``assessModel``.

    Returns the fitted ``GridSearchCV`` object.
    """
    import numpy as np
    from sklearn.ensemble import (
        AdaBoostClassifier,
        GradientBoostingClassifier,
        RandomForestClassifier,
    )
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC

    # One grid per estimator family; the list order below is kept as-is.
    forest_grid = {
        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': [10, 100, 1000],
        'classifier__max_features': [1, 2, 3],
    }
    logistic_grid = {
        'classifier': [LogisticRegression(solver='liblinear')],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__C': np.logspace(0, 4, 10),
    }
    boosting_grid = {
        'classifier': [GradientBoostingClassifier()],
        'classifier__n_estimators': [10, 100, 1000],
        'classifier__learning_rate': [0.001, 0.01, 0.1],
        'classifier__max_depth': [1, 2, 3],
    }
    adaboost_grid = {'classifier': [AdaBoostClassifier()]}
    svc_grid = {
        'classifier': [SVC()],
        'classifier__C': [0.1, 1, 10],
        'classifier__gamma': [1, 0.1, 0.01],
        'classifier__kernel': ['rbf', 'poly', 'sigmoid'],
    }
    search_space = [forest_grid, logistic_grid, boosting_grid, adaboost_grid, svc_grid]

    pipe = Pipeline([('classifier', RandomForestClassifier())])
    search = GridSearchCV(pipe, search_space, cv=5, verbose=0, n_jobs=-1)
    best_model = search.fit(X_train, y_train)
    assessModel(y_test, best_model.predict(X_test))
    return best_model

def getBestfeatures(X, y, k=35, random_state=None):
    """Keep the ``k`` features of ``X`` scoring highest on mutual information with ``y``.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Class label for each row of ``X``.
    k : int, optional
        Number of features to keep. Defaults to 35, the value that was
        previously hard-coded here.
    random_state : int or None, optional
        Seed forwarded to ``mutual_info_classif``. With the default ``None``
        the scorer is used exactly as before; pass an integer to make the
        selection repeatable between runs.

    Returns
    -------
    The reduced feature matrix returned by ``SelectKBest.fit_transform``; its
    shape is printed before returning.
    """
    from functools import partial
    from sklearn.feature_selection import SelectKBest, mutual_info_classif

    scorer = mutual_info_classif
    if random_state is not None:
        scorer = partial(mutual_info_classif, random_state=random_state)
    X_new = SelectKBest(scorer, k=k).fit_transform(X, y)
    print(X_new.shape)
    return X_new

def getBestFeatureSFS(X, y, n_features_to_select=30, direction='backward'):
    """Select features with a random-forest-driven sequential feature selector.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Class label for each row of ``X``.
    n_features_to_select : int, optional
        Forwarded to ``SequentialFeatureSelector``. Defaults to 30, the value
        that was previously hard-coded here.
    direction : str, optional
        Forwarded to ``SequentialFeatureSelector``. Defaults to ``'backward'``,
        the direction that was previously hard-coded here.

    Returns
    -------
    ``X`` transformed by the fitted selector; its shape is printed before
    returning.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import SequentialFeatureSelector

    selector = SequentialFeatureSelector(
        RandomForestClassifier(),
        n_features_to_select=n_features_to_select,
        direction=direction,
    )
    X_new = selector.fit(X, y).transform(X)
    print(X_new.shape)
    return X_new


from sklearn.model_selection import train_test_split

# --- Morgan count fingerprints as features ---------------------------------
X_fp = calcFingerprint(dataMol)
X = X_fp["MolFP"].to_list()
y = dataMol['activity'].to_list()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
getBestModel(X_train, y_train, X_test, y_test)
# NOTE(review): features are selected on the full X, y before the split, so the
# held-out rows take part in feature selection -- confirm this is intended.
X_new = getBestfeatures(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)
getBestModel(X_train, y_train, X_test, y_test)
# Disabled: sequential-feature-selection variant.
# X_new = getBestFeatureSFS(X, y)
# X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)
# getBestModel(X_train, y_train, X_test, y_test)

# --- RDKit descriptors as features -----------------------------------------
X_prop = calcProp(dataMol)
X = X_prop["prop"].to_list()
y = dataMol['activity'].to_list()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
getBestModel(X_train, y_train, X_test, y_test)
# NOTE(review): same selection-before-split pattern as above.
X_new = getBestfeatures(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)
# Bind the result so the cell does not end on a bare expression, which a
# notebook would echo as the cell's output.
best_model = getBestModel(X_train, y_train, X_test, y_test)
# Disabled: sequential-feature-selection variant.
# X_new = getBestFeatureSFS(X, y)
# X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)
# getBestModel(X_train, y_train, X_test, y_test)

0.5833333333333334
0.9074074074074074
[[91  2]
 [ 8  7]]
              precision    recall  f1-score   support

           0       0.92      0.98      0.95        93
           1       0.78      0.47      0.58        15

    accuracy                           0.91       108
   macro avg       0.85      0.72      0.77       108
weighted avg       0.90      0.91      0.90       108

(357, 35)
0.6923076923076923
0.9259259259259259
[[91  2]
 [ 6  9]]
              precision    recall  f1-score   support

           0       0.94      0.98      0.96        93
           1       0.82      0.60      0.69        15

    accuracy                           0.93       108
   macro avg       0.88      0.79      0.83       108
weighted avg       0.92      0.93      0.92       108



45 fits failed out of a total of 420.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "c:\miniforge3\envs\chem\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\miniforge3\envs\chem\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\miniforge3\envs\chem\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\miniforge3\envs\chem\Lib\site-packages\sklearn\base.py", line 

0.38095238095238093
0.8796296296296297
[[91  2]
 [11  4]]
              precision    recall  f1-score   support

           0       0.89      0.98      0.93        93
           1       0.67      0.27      0.38        15

    accuracy                           0.88       108
   macro avg       0.78      0.62      0.66       108
weighted avg       0.86      0.88      0.86       108

(357, 35)
0.45454545454545453
0.8888888888888888
[[91  2]
 [10  5]]
              precision    recall  f1-score   support

           0       0.90      0.98      0.94        93
           1       0.71      0.33      0.45        15

    accuracy                           0.89       108
   macro avg       0.81      0.66      0.70       108
weighted avg       0.88      0.89      0.87       108



  _data = np.array(data, dtype=dtype, copy=copy,


' X_new = getBestFeatureSFS(X, y)\nX_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)   \ngetBestModel(X_train, y_train, X_test, y_test) '

In [None]:
import rdkit
from rdkit.Chem import PandasTools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import rdkit

# Location of the SD file holding the structures and their "IC50 uM" field.
stFile = r"C:\Users\am7574\OneDrive - Corteva\Documents\Projects\AI_chemistry\TK_AI\TK_ALL.sdf"
data = PandasTools.LoadSDF(stFile)

# Keep only the molecule and IC50 columns, discard records missing either, and
# take an explicit copy so the column added below is written to an independent
# frame rather than to a slice derived from `data`.
dataMol = data[["ROMol", "IC50 uM"]].dropna().copy()

# Binary label: 0 when the field is the literal 'NI' or parses to a number at
# or above the threshold, 1 otherwise.
ACTIVITY_THRESHOLD = 20
dataMol['activity'] = dataMol['IC50 uM'].apply(
    lambda x: 0 if x == 'NI' or float(x) >= ACTIVITY_THRESHOLD else 1
)

def assessModel(y_test, y_pred):
    """Print F1, accuracy, confusion matrix and classification report.

    Each value is computed by the scikit-learn metric of the same name from
    ``y_test`` and ``y_pred`` (passed in that order) and printed in turn.
    Nothing is returned.
    """
    from sklearn import metrics

    reporters = (
        metrics.f1_score,
        metrics.accuracy_score,
        metrics.confusion_matrix,
        metrics.classification_report,
    )
    for report in reporters:
        print(report(y_test, y_pred))

def calcFingerprint(stFile: str) -> pd.DataFrame:
    """Load an SD file and attach a Morgan count fingerprint to each record.

    NOTE(review): a later ``calcFingerprint(mols)`` definition in this cell
    rebinds the name, so this file-path variant is not the one called below.

    Parameters
    ----------
    stFile : str
        Path of the SD file passed to ``PandasTools.LoadSDF``.

    Returns
    -------
    pd.DataFrame
        The loaded records with every row containing a missing value removed,
        plus a ``"MolFP"`` column holding the NumPy count fingerprint of the
        row's ``"ROMol"``.
    """
    from rdkit.Chem import PandasTools
    from rdkit.Chem import rdFingerprintGenerator

    sdData = PandasTools.LoadSDF(stFile)
    mfpGen = rdFingerprintGenerator.GetMorganGenerator()
    # Filter first and copy: fingerprints are then computed only for the rows
    # that are returned, and the new column is written to an independent frame.
    dataMol = sdData.dropna().copy()
    dataMol["MolFP"] = dataMol["ROMol"].apply(lambda x: mfpGen.GetCountFingerprintAsNumPy(x))
    return dataMol

def calcProp(stFile: str) -> pd.DataFrame:
    """Load an SD file and attach RDKit descriptor values to each record.

    NOTE(review): a later ``calcProp(mols)`` definition in this cell rebinds
    the name, so this file-path variant is not the one called below.

    Parameters
    ----------
    stFile : str
        Path of the SD file passed to ``PandasTools.LoadSDF``.

    Returns
    -------
    pd.DataFrame
        The loaded records with every row containing a missing value removed,
        plus a ``"prop"`` column holding the result of ``CalcDescriptors`` for
        the row's ``"ROMol"``, using every name listed in
        ``Descriptors._descList``.
    """
    # Import the descriptor module explicitly instead of reaching it through
    # the global ``rdkit`` name and relying on another import having loaded it.
    from rdkit.Chem import Descriptors
    from rdkit.Chem import PandasTools
    from rdkit.ML.Descriptors import MoleculeDescriptors

    sdData = PandasTools.LoadSDF(stFile)
    # Filter first and copy: descriptors are then computed only for the rows
    # that are returned, and the new column is written to an independent frame.
    dataMol = sdData.dropna().copy()
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    dataMol["prop"] = dataMol["ROMol"].apply(lambda x: calc.CalcDescriptors(x))
    return dataMol

def calcFingerprint(mols: pd.DataFrame) -> pd.DataFrame:
    """Add a Morgan count-fingerprint column to ``mols`` and return it.

    ``mols`` is modified in place: a ``"MolFP"`` column is written holding, for
    each row, the NumPy count fingerprint of the molecule in ``"ROMol"``,
    produced by a Morgan generator created with its default arguments. The
    same frame object is returned.
    """
    from rdkit.Chem import rdFingerprintGenerator

    generator = rdFingerprintGenerator.GetMorganGenerator()

    def toCountVector(mol):
        return generator.GetCountFingerprintAsNumPy(mol)

    fingerprints = mols["ROMol"].apply(toCountVector)
    mols["MolFP"] = fingerprints
    return mols

def calcProp(mols: pd.DataFrame) -> pd.DataFrame:
    """Add an RDKit descriptor column to ``mols`` and return it.

    Descriptor names are taken from ``rdkit.Chem.Descriptors._descList``,
    leaving out any whose name contains "PartialCharge", "BCUT2D" or "Morgan".
    ``mols`` is modified in place: a ``"prop"`` column is written holding the
    ``CalcDescriptors`` result for each row's ``"ROMol"``. The same frame
    object is returned.
    """
    from rdkit.ML.Descriptors import MoleculeDescriptors

    skipped = ("PartialCharge", "BCUT2D", "Morgan")
    des_name = []
    for entry in rdkit.Chem.Descriptors._descList:
        label = entry[0]
        if not any(tag in label for tag in skipped):
            des_name.append(label)

    calc = MoleculeDescriptors.MolecularDescriptorCalculator(des_name)

    def describe(mol):
        return calc.CalcDescriptors(mol)

    mols["prop"] = mols["ROMol"].apply(describe)
    return mols

def assessModel(y_test, y_pred):
    """Print F1, accuracy, confusion matrix and classification report.

    Each value is computed by the scikit-learn metric of the same name from
    ``y_test`` and ``y_pred`` (passed in that order) and printed in turn.
    Nothing is returned.
    """
    from sklearn import metrics

    reporters = (
        metrics.f1_score,
        metrics.accuracy_score,
        metrics.confusion_matrix,
        metrics.classification_report,
    )
    for report in reporters:
        print(report(y_test, y_pred))

def getBestModel(X_train, y_train, X_test, y_test):
    """Grid-search several classifiers, print test metrics, return the search.

    A one-step pipeline whose step is named ``'classifier'`` is tuned with a
    5-fold ``GridSearchCV`` over five estimator families (random forest,
    liblinear logistic regression, gradient boosting, AdaBoost and SVC) on
    ``X_train``/``y_train``. The fitted search then predicts ``X_test`` and the
    predictions are passed together with ``y_test`` to ``assessModel``.

    Returns the fitted ``GridSearchCV`` object.
    """
    import numpy as np
    from sklearn.ensemble import (
        AdaBoostClassifier,
        GradientBoostingClassifier,
        RandomForestClassifier,
    )
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC

    # One grid per estimator family; the list order below is kept as-is.
    forest_grid = {
        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': [10, 100, 1000],
        'classifier__max_features': [1, 2, 3],
    }
    logistic_grid = {
        'classifier': [LogisticRegression(solver='liblinear')],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__C': np.logspace(0, 4, 10),
    }
    boosting_grid = {
        'classifier': [GradientBoostingClassifier()],
        'classifier__n_estimators': [10, 100, 1000],
        'classifier__learning_rate': [0.001, 0.01, 0.1],
        'classifier__max_depth': [1, 2, 3],
    }
    adaboost_grid = {'classifier': [AdaBoostClassifier()]}
    svc_grid = {
        'classifier': [SVC()],
        'classifier__C': [0.1, 1, 10],
        'classifier__gamma': [1, 0.1, 0.01],
        'classifier__kernel': ['rbf', 'poly', 'sigmoid'],
    }
    search_space = [forest_grid, logistic_grid, boosting_grid, adaboost_grid, svc_grid]

    pipe = Pipeline([('classifier', RandomForestClassifier())])
    search = GridSearchCV(pipe, search_space, cv=5, verbose=0, n_jobs=-1)
    best_model = search.fit(X_train, y_train)
    assessModel(y_test, best_model.predict(X_test))
    return best_model

def getBestfeatures(X, y, k=40, random_state=None):
    """Keep the ``k`` features of ``X`` scoring highest on mutual information with ``y``.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Class label for each row of ``X``.
    k : int, optional
        Number of features to keep. Defaults to 40, the value that was
        previously hard-coded here.
    random_state : int or None, optional
        Seed forwarded to ``mutual_info_classif``. With the default ``None``
        the scorer is used exactly as before; pass an integer to make the
        selection repeatable between runs.

    Returns
    -------
    The reduced feature matrix returned by ``SelectKBest.fit_transform``; its
    shape is printed before returning.
    """
    from functools import partial
    from sklearn.feature_selection import SelectKBest, mutual_info_classif

    scorer = mutual_info_classif
    if random_state is not None:
        scorer = partial(mutual_info_classif, random_state=random_state)
    X_new = SelectKBest(scorer, k=k).fit_transform(X, y)
    print(X_new.shape)
    return X_new

def getBestFeatureSFS(X, y, n_features_to_select=30, direction='backward'):
    """Select features with a random-forest-driven sequential feature selector.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Class label for each row of ``X``.
    n_features_to_select : int, optional
        Forwarded to ``SequentialFeatureSelector``. Defaults to 30, the value
        that was previously hard-coded here.
    direction : str, optional
        Forwarded to ``SequentialFeatureSelector``. Defaults to ``'backward'``,
        the direction that was previously hard-coded here.

    Returns
    -------
    ``X`` transformed by the fitted selector; its shape is printed before
    returning.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import SequentialFeatureSelector

    selector = SequentialFeatureSelector(
        RandomForestClassifier(),
        n_features_to_select=n_features_to_select,
        direction=direction,
    )
    X_new = selector.fit(X, y).transform(X)
    print(X_new.shape)
    return X_new


from sklearn.model_selection import train_test_split

# --- Morgan count fingerprints as features ---------------------------------
X_fp = calcFingerprint(dataMol)
X = X_fp["MolFP"].to_list()
y = dataMol['activity'].to_list()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
getBestModel(X_train, y_train, X_test, y_test)
# NOTE(review): features are selected on the full X, y before the split, so the
# held-out rows take part in feature selection -- confirm this is intended.
X_new = getBestfeatures(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)
getBestModel(X_train, y_train, X_test, y_test)
# Disabled: sequential-feature-selection variant.
# X_new = getBestFeatureSFS(X, y)
# X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)
# getBestModel(X_train, y_train, X_test, y_test)

# --- RDKit descriptors as features -----------------------------------------
X_prop = calcProp(dataMol)
X = X_prop["prop"].to_list()
y = dataMol['activity'].to_list()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
getBestModel(X_train, y_train, X_test, y_test)
# NOTE(review): same selection-before-split pattern as above.
X_new = getBestfeatures(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)
# Bind the result so the cell does not end on a bare expression, which a
# notebook would echo as the cell's output.
best_model = getBestModel(X_train, y_train, X_test, y_test)
# Disabled: sequential-feature-selection variant.
# X_new = getBestFeatureSFS(X, y)
# X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)
# getBestModel(X_train, y_train, X_test, y_test)

0.5833333333333334
0.9074074074074074
[[91  2]
 [ 8  7]]
              precision    recall  f1-score   support

           0       0.92      0.98      0.95        93
           1       0.78      0.47      0.58        15

    accuracy                           0.91       108
   macro avg       0.85      0.72      0.77       108
weighted avg       0.90      0.91      0.90       108

(357, 40)
0.72
0.9351851851851852
[[92  1]
 [ 6  9]]
              precision    recall  f1-score   support

           0       0.94      0.99      0.96        93
           1       0.90      0.60      0.72        15

    accuracy                           0.94       108
   macro avg       0.92      0.79      0.84       108
weighted avg       0.93      0.94      0.93       108



45 fits failed out of a total of 420.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "c:\miniforge3\envs\chem\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\miniforge3\envs\chem\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\miniforge3\envs\chem\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\miniforge3\envs\chem\Lib\site-packages\sklearn\base.py", line 

0.3157894736842105
0.8796296296296297
[[92  1]
 [12  3]]
              precision    recall  f1-score   support

           0       0.88      0.99      0.93        93
           1       0.75      0.20      0.32        15

    accuracy                           0.88       108
   macro avg       0.82      0.59      0.62       108
weighted avg       0.87      0.88      0.85       108

(357, 40)
0.21052631578947367
0.8611111111111112
[[91  2]
 [13  2]]
              precision    recall  f1-score   support

           0       0.88      0.98      0.92        93
           1       0.50      0.13      0.21        15

    accuracy                           0.86       108
   macro avg       0.69      0.56      0.57       108
weighted avg       0.82      0.86      0.82       108



' X_new = getBestFeatureSFS(X, y)\nX_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)   \ngetBestModel(X_train, y_train, X_test, y_test) '

In [None]:
import rdkit
from rdkit.Chem import PandasTools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import rdkit

# Location of the SD file holding the structures and their "IC50 uM" field.
stFile = r"C:\Users\am7574\OneDrive - Corteva\Documents\Projects\AI_chemistry\TK_AI\TK_ALL.sdf"
data = PandasTools.LoadSDF(stFile)

# Keep only the molecule and IC50 columns, discard records missing either, and
# take an explicit copy so the column added below is written to an independent
# frame rather than to a slice derived from `data`.
dataMol = data[["ROMol", "IC50 uM"]].dropna().copy()

# Binary label: 0 when the field is the literal 'NI' or parses to a number at
# or above the threshold, 1 otherwise.
ACTIVITY_THRESHOLD = 20
dataMol['activity'] = dataMol['IC50 uM'].apply(
    lambda x: 0 if x == 'NI' or float(x) >= ACTIVITY_THRESHOLD else 1
)

def assessModel(y_test, y_pred):
    """Print F1, accuracy, confusion matrix and classification report.

    Each value is computed by the scikit-learn metric of the same name from
    ``y_test`` and ``y_pred`` (passed in that order) and printed in turn.
    Nothing is returned.
    """
    from sklearn import metrics

    reporters = (
        metrics.f1_score,
        metrics.accuracy_score,
        metrics.confusion_matrix,
        metrics.classification_report,
    )
    for report in reporters:
        print(report(y_test, y_pred))

def calcFingerprint(stFile: str) -> pd.DataFrame:
    """Load an SD file and attach a Morgan count fingerprint to each record.

    NOTE(review): a later ``calcFingerprint(mols)`` definition in this cell
    rebinds the name, so this file-path variant is not the one called below.

    Parameters
    ----------
    stFile : str
        Path of the SD file passed to ``PandasTools.LoadSDF``.

    Returns
    -------
    pd.DataFrame
        The loaded records with every row containing a missing value removed,
        plus a ``"MolFP"`` column holding the NumPy count fingerprint of the
        row's ``"ROMol"``.
    """
    from rdkit.Chem import PandasTools
    from rdkit.Chem import rdFingerprintGenerator

    sdData = PandasTools.LoadSDF(stFile)
    mfpGen = rdFingerprintGenerator.GetMorganGenerator()
    # Filter first and copy: fingerprints are then computed only for the rows
    # that are returned, and the new column is written to an independent frame.
    dataMol = sdData.dropna().copy()
    dataMol["MolFP"] = dataMol["ROMol"].apply(lambda x: mfpGen.GetCountFingerprintAsNumPy(x))
    return dataMol

def calcProp(stFile: str) -> pd.DataFrame:
    """Load an SD file and attach RDKit descriptor values to each record.

    NOTE(review): a later ``calcProp(mols)`` definition in this cell rebinds
    the name, so this file-path variant is not the one called below.

    Parameters
    ----------
    stFile : str
        Path of the SD file passed to ``PandasTools.LoadSDF``.

    Returns
    -------
    pd.DataFrame
        The loaded records with every row containing a missing value removed,
        plus a ``"prop"`` column holding the result of ``CalcDescriptors`` for
        the row's ``"ROMol"``, using every name listed in
        ``Descriptors._descList``.
    """
    # Import the descriptor module explicitly instead of reaching it through
    # the global ``rdkit`` name and relying on another import having loaded it.
    from rdkit.Chem import Descriptors
    from rdkit.Chem import PandasTools
    from rdkit.ML.Descriptors import MoleculeDescriptors

    sdData = PandasTools.LoadSDF(stFile)
    # Filter first and copy: descriptors are then computed only for the rows
    # that are returned, and the new column is written to an independent frame.
    dataMol = sdData.dropna().copy()
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    dataMol["prop"] = dataMol["ROMol"].apply(lambda x: calc.CalcDescriptors(x))
    return dataMol

def calcFingerprint(mols: pd.DataFrame) -> pd.DataFrame:
    """Add a Morgan count-fingerprint column to ``mols`` and return it.

    ``mols`` is modified in place: a ``"MolFP"`` column is written holding, for
    each row, the NumPy count fingerprint of the molecule in ``"ROMol"``,
    produced by a Morgan generator created with its default arguments. The
    same frame object is returned.
    """
    from rdkit.Chem import rdFingerprintGenerator

    generator = rdFingerprintGenerator.GetMorganGenerator()

    def toCountVector(mol):
        return generator.GetCountFingerprintAsNumPy(mol)

    fingerprints = mols["ROMol"].apply(toCountVector)
    mols["MolFP"] = fingerprints
    return mols

def calcProp(mols: pd.DataFrame) -> pd.DataFrame:
    """Add an RDKit descriptor column to ``mols`` and return it.

    Descriptor names are taken from ``rdkit.Chem.Descriptors._descList``,
    leaving out any whose name contains "PartialCharge", "BCUT2D" or "Morgan".
    ``mols`` is modified in place: a ``"prop"`` column is written holding the
    ``CalcDescriptors`` result for each row's ``"ROMol"``. The same frame
    object is returned.
    """
    from rdkit.ML.Descriptors import MoleculeDescriptors

    skipped = ("PartialCharge", "BCUT2D", "Morgan")
    des_name = []
    for entry in rdkit.Chem.Descriptors._descList:
        label = entry[0]
        if not any(tag in label for tag in skipped):
            des_name.append(label)

    calc = MoleculeDescriptors.MolecularDescriptorCalculator(des_name)

    def describe(mol):
        return calc.CalcDescriptors(mol)

    mols["prop"] = mols["ROMol"].apply(describe)
    return mols

def assessModel(y_test, y_pred):
    """Print F1, accuracy, confusion matrix and classification report.

    Each value is computed by the scikit-learn metric of the same name from
    ``y_test`` and ``y_pred`` (passed in that order) and printed in turn.
    Nothing is returned.
    """
    from sklearn import metrics

    reporters = (
        metrics.f1_score,
        metrics.accuracy_score,
        metrics.confusion_matrix,
        metrics.classification_report,
    )
    for report in reporters:
        print(report(y_test, y_pred))

def getBestModel(X_train, y_train, X_test, y_test):
    """Grid-search several classifiers, print test metrics, return the search.

    A one-step pipeline whose step is named ``'classifier'`` is tuned with a
    5-fold ``GridSearchCV`` over five estimator families (random forest,
    liblinear logistic regression, gradient boosting, AdaBoost and SVC) on
    ``X_train``/``y_train``. The fitted search then predicts ``X_test`` and the
    predictions are passed together with ``y_test`` to ``assessModel``.

    Returns the fitted ``GridSearchCV`` object.
    """
    import numpy as np
    from sklearn.ensemble import (
        AdaBoostClassifier,
        GradientBoostingClassifier,
        RandomForestClassifier,
    )
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC

    # One grid per estimator family; the list order below is kept as-is.
    forest_grid = {
        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': [10, 100, 1000],
        'classifier__max_features': [1, 2, 3],
    }
    logistic_grid = {
        'classifier': [LogisticRegression(solver='liblinear')],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__C': np.logspace(0, 4, 10),
    }
    boosting_grid = {
        'classifier': [GradientBoostingClassifier()],
        'classifier__n_estimators': [10, 100, 1000],
        'classifier__learning_rate': [0.001, 0.01, 0.1],
        'classifier__max_depth': [1, 2, 3],
    }
    adaboost_grid = {'classifier': [AdaBoostClassifier()]}
    svc_grid = {
        'classifier': [SVC()],
        'classifier__C': [0.1, 1, 10],
        'classifier__gamma': [1, 0.1, 0.01],
        'classifier__kernel': ['rbf', 'poly', 'sigmoid'],
    }
    search_space = [forest_grid, logistic_grid, boosting_grid, adaboost_grid, svc_grid]

    pipe = Pipeline([('classifier', RandomForestClassifier())])
    search = GridSearchCV(pipe, search_space, cv=5, verbose=0, n_jobs=-1)
    best_model = search.fit(X_train, y_train)
    assessModel(y_test, best_model.predict(X_test))
    return best_model

def getBestfeatures(X, y, k=45, random_state=None):
    """Keep the ``k`` features of ``X`` scoring highest on mutual information with ``y``.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Class label for each row of ``X``.
    k : int, optional
        Number of features to keep. Defaults to 45, the value that was
        previously hard-coded here.
    random_state : int or None, optional
        Seed forwarded to ``mutual_info_classif``. With the default ``None``
        the scorer is used exactly as before; pass an integer to make the
        selection repeatable between runs.

    Returns
    -------
    The reduced feature matrix returned by ``SelectKBest.fit_transform``; its
    shape is printed before returning.
    """
    from functools import partial
    from sklearn.feature_selection import SelectKBest, mutual_info_classif

    scorer = mutual_info_classif
    if random_state is not None:
        scorer = partial(mutual_info_classif, random_state=random_state)
    X_new = SelectKBest(scorer, k=k).fit_transform(X, y)
    print(X_new.shape)
    return X_new

def getBestFeatureSFS(X, y, n_features_to_select=30, direction='backward'):
    """Select features with sequential feature selection around a random forest.

    Args:
        X: Feature matrix (one row per sample).
        y: Class labels.
        n_features_to_select: Number of features to keep (default 30, the
            previously hard-coded value).
        direction: ``'backward'`` (default, as before) or ``'forward'``;
            forwarded to ``SequentialFeatureSelector``.

    Returns:
        ``X`` reduced to the selected columns; its shape is also printed.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import SequentialFeatureSelector

    # Named for what it is: the default direction is backward, not forward.
    selector = SequentialFeatureSelector(
        RandomForestClassifier(),
        n_features_to_select=n_features_to_select,
        direction=direction,
    ).fit(X, y)
    X_new = selector.transform(X)
    print(X_new.shape)
    return X_new


from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split

# ---- Run 1: Morgan count fingerprints as features ----
X_fp = calcFingerprint(dataMol)
X = X_fp["MolFP"].to_list()
y = dataMol['activity'].to_list()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
getBestModel(X_train, y_train, X_test, y_test)
# Rank features on the training rows only, so hold-out labels cannot influence
# which columns are kept. Re-splitting with the same random_state and the same
# number of rows reproduces the partition the selector was fitted on.
selector = SelectKBest(mutual_info_classif, k=45).fit(X_train, y_train)
X_new = selector.transform(X)
print(X_new.shape)
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)
getBestModel(X_train, y_train, X_test, y_test)
# Sequential feature selection is disabled for this run:
# X_new = getBestFeatureSFS(X, y)
# X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)
# getBestModel(X_train, y_train, X_test, y_test)

# ---- Run 2: RDKit descriptors as features ----
X_prop = calcProp(dataMol)
X = X_prop["prop"].to_list()
y = dataMol['activity'].to_list()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
getBestModel(X_train, y_train, X_test, y_test)
# Same training-only feature ranking as in run 1.
selector = SelectKBest(mutual_info_classif, k=45).fit(X_train, y_train)
X_new = selector.transform(X)
print(X_new.shape)
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)
getBestModel(X_train, y_train, X_test, y_test)
# Sequential feature selection is disabled for this run (kept as a comment so
# the cell no longer echoes a string literal as its output):
# X_new = getBestFeatureSFS(X, y)
# X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)
# getBestModel(X_train, y_train, X_test, y_test)

0.5833333333333334
0.9074074074074074
[[91  2]
 [ 8  7]]
              precision    recall  f1-score   support

           0       0.92      0.98      0.95        93
           1       0.78      0.47      0.58        15

    accuracy                           0.91       108
   macro avg       0.85      0.72      0.77       108
weighted avg       0.90      0.91      0.90       108

(357, 45)
0.72
0.9351851851851852
[[92  1]
 [ 6  9]]
              precision    recall  f1-score   support

           0       0.94      0.99      0.96        93
           1       0.90      0.60      0.72        15

    accuracy                           0.94       108
   macro avg       0.92      0.79      0.84       108
weighted avg       0.93      0.94      0.93       108



45 fits failed out of a total of 420.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "c:\miniforge3\envs\chem\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\miniforge3\envs\chem\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\miniforge3\envs\chem\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\miniforge3\envs\chem\Lib\site-packages\sklearn\base.py", line 

0.21052631578947367
0.8611111111111112
[[91  2]
 [13  2]]
              precision    recall  f1-score   support

           0       0.88      0.98      0.92        93
           1       0.50      0.13      0.21        15

    accuracy                           0.86       108
   macro avg       0.69      0.56      0.57       108
weighted avg       0.82      0.86      0.82       108

(357, 45)
0.2857142857142857
0.8611111111111112
[[90  3]
 [12  3]]
              precision    recall  f1-score   support

           0       0.88      0.97      0.92        93
           1       0.50      0.20      0.29        15

    accuracy                           0.86       108
   macro avg       0.69      0.58      0.60       108
weighted avg       0.83      0.86      0.83       108



' X_new = getBestFeatureSFS(X, y)\nX_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)   \ngetBestModel(X_train, y_train, X_test, y_test) '

In [None]:
import rdkit
from rdkit.Chem import PandasTools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import rdkit

# Load the SD file and derive a binary activity label from its IC50 column.
stFile = r"C:\Users\am7574\OneDrive - Corteva\Documents\Projects\AI_chemistry\TK_AI\TK_ALL.sdf"
data = PandasTools.LoadSDF(stFile)
# Keep only the molecule and IC50 columns, drop incomplete rows, and take an
# explicit copy so the column assignments below (and in calcFingerprint /
# calcProp, which add columns in place) write to an independent frame instead
# of a filtered slice of `data`.
dataMol = data[["ROMol", "IC50 uM"]].dropna().copy()
# Label 0 when the entry is the string 'NI' or the value is >= 20 (column is in
# uM), otherwise 1. NOTE(review): any other non-numeric entry makes float()
# raise; confirm 'NI' is the only text value in the file.
dataMol['activity'] = dataMol['IC50 uM'].apply(lambda x: 0 if x == 'NI' or float(x) >= 20 else 1)

def assessModel(y_test, y_pred):
    """Print F1, accuracy, confusion matrix and classification report.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, in the same order as ``y_test``.

    The four results are printed one after another in the order listed above;
    nothing is returned.
    """
    from sklearn.metrics import (
        accuracy_score,
        classification_report,
        confusion_matrix,
        f1_score,
    )

    report_order = (f1_score, accuracy_score, confusion_matrix, classification_report)
    for metric in report_order:
        print(metric(y_test, y_pred))

def calcFingerprint(stFile: str) -> pd.DataFrame:
    """Load an SD file and attach Morgan count fingerprints to complete rows.

    Args:
        stFile: Path of the SD file to load with ``PandasTools.LoadSDF``.

    Returns:
        The loaded frame restricted to rows without missing values, with an
        added ``MolFP`` column holding one NumPy count fingerprint per molecule.

    NOTE(review): a later ``calcFingerprint(mols)`` definition in this file
    rebinds the name, so this path-based variant is replaced once that
    definition runs.
    """
    from rdkit.Chem import PandasTools, rdFingerprintGenerator

    sdData = PandasTools.LoadSDF(stFile)
    mfpGen = rdFingerprintGenerator.GetMorganGenerator()
    # Copy the filtered rows so adding a column does not write into a slice of
    # sdData; fingerprints are computed only for the rows that are kept.
    dataMol = sdData.dropna().copy()
    dataMol["MolFP"] = dataMol["ROMol"].apply(lambda x: mfpGen.GetCountFingerprintAsNumPy(x))
    return dataMol

def calcProp(stFile: str) -> pd.DataFrame:
    """Load an SD file and attach RDKit descriptor values to complete rows.

    Args:
        stFile: Path of the SD file to load with ``PandasTools.LoadSDF``.

    Returns:
        The loaded frame restricted to rows without missing values, with an
        added ``prop`` column holding the result of ``CalcDescriptors`` for
        every descriptor named in ``Descriptors._descList``.

    NOTE(review): a later ``calcProp(mols)`` definition in this file rebinds
    the name, so this path-based variant is replaced once that definition runs.
    """
    # Import Descriptors explicitly instead of reaching it through the global
    # `rdkit` name, which only works if the submodule was already imported.
    from rdkit.Chem import Descriptors, PandasTools
    from rdkit.ML.Descriptors import MoleculeDescriptors

    sdData = PandasTools.LoadSDF(stFile)
    calc = MoleculeDescriptors.MolecularDescriptorCalculator(
        [entry[0] for entry in Descriptors._descList]
    )
    # Copy the filtered rows so adding a column does not write into a slice of
    # sdData; descriptors are computed only for the rows that are kept.
    dataMol = sdData.dropna().copy()
    dataMol["prop"] = dataMol["ROMol"].apply(lambda x: calc.CalcDescriptors(x))
    return dataMol

def calcFingerprint(mols: pd.DataFrame) -> pd.DataFrame:
    """Add a ``MolFP`` column of Morgan count fingerprints to *mols*.

    Args:
        mols: Frame with an ``ROMol`` column of RDKit molecules. It is
            modified in place.

    Returns:
        The same frame object, now carrying the ``MolFP`` column (one NumPy
        array per molecule, from a Morgan generator with default settings).
    """
    from rdkit.Chem import rdFingerprintGenerator

    generator = rdFingerprintGenerator.GetMorganGenerator()

    def _count_fp(mol):
        return generator.GetCountFingerprintAsNumPy(mol)

    mols["MolFP"] = mols["ROMol"].apply(_count_fp)
    return mols

def calcProp(mols: pd.DataFrame) -> pd.DataFrame:
    """Add a ``prop`` column of RDKit descriptor values to *mols*.

    Descriptors whose name contains ``PartialCharge``, ``BCUT2D`` or ``Morgan``
    are left out; all other entries of ``Descriptors._descList`` are computed.

    Args:
        mols: Frame with an ``ROMol`` column of RDKit molecules. It is
            modified in place.

    Returns:
        The same frame object, now carrying the ``prop`` column (the result of
        ``CalcDescriptors`` for each molecule).
    """
    from rdkit.ML.Descriptors import MoleculeDescriptors

    skipped_tags = ("PartialCharge", "BCUT2D", "Morgan")
    des_name = []
    for entry in rdkit.Chem.Descriptors._descList:
        label = entry[0]
        if any(tag in label for tag in skipped_tags):
            continue
        des_name.append(label)

    calc = MoleculeDescriptors.MolecularDescriptorCalculator(des_name)

    def _describe(mol):
        return calc.CalcDescriptors(mol)

    mols["prop"] = mols["ROMol"].apply(_describe)
    return mols

def assessModel(y_test, y_pred):
    """Print F1, accuracy, confusion matrix and classification report.

    NOTE: this repeats an identical ``assessModel`` defined earlier in the same
    cell; being the later definition, it is the one in effect afterwards.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, in the same order as ``y_test``.

    Nothing is returned; the four results are printed in the order above.
    """
    from sklearn import metrics

    print(metrics.f1_score(y_test, y_pred))
    print(metrics.accuracy_score(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))
    print(metrics.classification_report(y_test, y_pred))

def getBestModel(X_train, y_train, X_test, y_test, *, cv=5, scoring=None, n_jobs=-1):
    """Grid-search several classifier families and print hold-out metrics.

    The pipeline has a single ``classifier`` step, and the estimator placed in
    that step is itself one of the searched dimensions, so all families listed
    below compete inside one ``GridSearchCV`` run.

    Args:
        X_train, y_train: Training features and labels given to the search.
        X_test, y_test: Hold-out features and labels; the predictions for
            ``X_test`` are passed to ``assessModel`` for printing.
        cv: Cross-validation setting forwarded to ``GridSearchCV``
            (default 5, the previously hard-coded value).
        scoring: Scorer forwarded to ``GridSearchCV``; ``None`` keeps the
            estimator's default scorer, which is what the search used before.
        n_jobs: Parallelism forwarded to ``GridSearchCV`` (default -1).

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    import numpy as np
    from sklearn.ensemble import (
        AdaBoostClassifier,
        GradientBoostingClassifier,
        RandomForestClassifier,
    )
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC

    # The initial estimator is only a placeholder; every grid below replaces it.
    pipe = Pipeline([('classifier', RandomForestClassifier())])
    search_space = [
        {'classifier': [RandomForestClassifier()],
         'classifier__n_estimators': [10, 100, 1000],
         'classifier__max_features': [1, 2, 3]},
        {'classifier': [LogisticRegression(solver='liblinear')],
         'classifier__penalty': ['l1', 'l2'],
         'classifier__C': np.logspace(0, 4, 10)},
        {'classifier': [GradientBoostingClassifier()],
         'classifier__n_estimators': [10, 100, 1000],
         'classifier__learning_rate': [0.001, 0.01, 0.1],
         'classifier__max_depth': [1, 2, 3]},
        {'classifier': [AdaBoostClassifier()]},
        {'classifier': [SVC()],
         'classifier__C': [0.1, 1, 10],
         'classifier__gamma': [1, 0.1, 0.01],
         'classifier__kernel': ['rbf', 'poly', 'sigmoid']},
    ]

    search = GridSearchCV(pipe, search_space, cv=cv, scoring=scoring,
                          verbose=0, n_jobs=n_jobs)
    search.fit(X_train, y_train)
    y_pred = search.predict(X_test)
    assessModel(y_test, y_pred)
    return search

def getBestfeatures(X, y, k=50):
    """Keep the ``k`` features with the highest mutual information with ``y``.

    Args:
        X: Feature matrix (one row per sample).
        y: Class labels used to score the features.
        k: Number of features to keep (default 50, the previously hard-coded
            value).

    Returns:
        The reduced feature matrix produced by ``SelectKBest.fit_transform``;
        its shape is also printed.

    NOTE(review): the ranking uses ``y``, so passing rows that are later used
    as a hold-out set lets their labels influence which features are kept.
    """
    from sklearn.feature_selection import SelectKBest, mutual_info_classif

    selector = SelectKBest(mutual_info_classif, k=k)
    X_new = selector.fit_transform(X, y)
    print(X_new.shape)
    return X_new

def getBestFeatureSFS(X, y, n_features_to_select=30, direction='backward'):
    """Select features with sequential feature selection around a random forest.

    Args:
        X: Feature matrix (one row per sample).
        y: Class labels.
        n_features_to_select: Number of features to keep (default 30, the
            previously hard-coded value).
        direction: ``'backward'`` (default, as before) or ``'forward'``;
            forwarded to ``SequentialFeatureSelector``.

    Returns:
        ``X`` reduced to the selected columns; its shape is also printed.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import SequentialFeatureSelector

    # Named for what it is: the default direction is backward, not forward.
    selector = SequentialFeatureSelector(
        RandomForestClassifier(),
        n_features_to_select=n_features_to_select,
        direction=direction,
    ).fit(X, y)
    X_new = selector.transform(X)
    print(X_new.shape)
    return X_new


from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split

# ---- Run 1: Morgan count fingerprints as features ----
X_fp = calcFingerprint(dataMol)
X = X_fp["MolFP"].to_list()
y = dataMol['activity'].to_list()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
getBestModel(X_train, y_train, X_test, y_test)
# Rank features on the training rows only, so hold-out labels cannot influence
# which columns are kept. Re-splitting with the same random_state and the same
# number of rows reproduces the partition the selector was fitted on.
selector = SelectKBest(mutual_info_classif, k=50).fit(X_train, y_train)
X_new = selector.transform(X)
print(X_new.shape)
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)
getBestModel(X_train, y_train, X_test, y_test)
# Sequential feature selection is disabled for this run:
# X_new = getBestFeatureSFS(X, y)
# X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)
# getBestModel(X_train, y_train, X_test, y_test)

# ---- Run 2: RDKit descriptors as features ----
X_prop = calcProp(dataMol)
X = X_prop["prop"].to_list()
y = dataMol['activity'].to_list()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
getBestModel(X_train, y_train, X_test, y_test)
# Same training-only feature ranking as in run 1.
selector = SelectKBest(mutual_info_classif, k=50).fit(X_train, y_train)
X_new = selector.transform(X)
print(X_new.shape)
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)
getBestModel(X_train, y_train, X_test, y_test)
# Sequential feature selection is disabled for this run (kept as a comment so
# the cell no longer echoes a string literal as its output):
# X_new = getBestFeatureSFS(X, y)
# X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)
# getBestModel(X_train, y_train, X_test, y_test)

  _data = np.array(data, dtype=dtype, copy=copy,


0.5833333333333334
0.9074074074074074
[[91  2]
 [ 8  7]]
              precision    recall  f1-score   support

           0       0.92      0.98      0.95        93
           1       0.78      0.47      0.58        15

    accuracy                           0.91       108
   macro avg       0.85      0.72      0.77       108
weighted avg       0.90      0.91      0.90       108

(357, 50)
0.7333333333333333
0.9259259259259259
[[89  4]
 [ 4 11]]
              precision    recall  f1-score   support

           0       0.96      0.96      0.96        93
           1       0.73      0.73      0.73        15

    accuracy                           0.93       108
   macro avg       0.85      0.85      0.85       108
weighted avg       0.93      0.93      0.93       108



45 fits failed out of a total of 420.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "c:\miniforge3\envs\chem\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\miniforge3\envs\chem\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\miniforge3\envs\chem\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\miniforge3\envs\chem\Lib\site-packages\sklearn\base.py", line 

0.21052631578947367
0.8611111111111112
[[91  2]
 [13  2]]
              precision    recall  f1-score   support

           0       0.88      0.98      0.92        93
           1       0.50      0.13      0.21        15

    accuracy                           0.86       108
   macro avg       0.69      0.56      0.57       108
weighted avg       0.82      0.86      0.82       108

(357, 50)
0.21052631578947367
0.8611111111111112
[[91  2]
 [13  2]]
              precision    recall  f1-score   support

           0       0.88      0.98      0.92        93
           1       0.50      0.13      0.21        15

    accuracy                           0.86       108
   macro avg       0.69      0.56      0.57       108
weighted avg       0.82      0.86      0.82       108



  _data = np.array(data, dtype=dtype, copy=copy,


' X_new = getBestFeatureSFS(X, y)\nX_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)   \ngetBestModel(X_train, y_train, X_test, y_test) '