In [1]:
import deepchem as dc
import pandas as pd
import tempfile
from typing import Union
from datetime import datetime
import os
import shutil
import torch
import glob
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, cohen_kappa_score
from chembl_webresource_client.new_client import new_client

2023-12-05 20:22:54.645439: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/payam/.virtualenvs/nabi/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'pytorch_lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [2]:
def data_training(path: str, label: Union[int, str]) -> pd.DataFrame:
    """Loads and combines CSV files from a directory, assigns class labels, 
    removes duplicates, and returns a DataFrame.

    Args:
        path (str): Directory path containing the CSV files.
        label (Union[int, str]): Label value to be assigned as class 1.

    Returns:
        pd.DataFrame: DataFrame containing the 'smiles' column and class labels.

    Raises:
        ValueError: If the label is not in the valid range.
    """
    # Number of CSV files in the directory
    number_of_labels = len(glob.glob(f'{path}/*.csv'))

    # Convert label to string and pad with zeros if necessary
    label_str = str(label).zfill(2)

    # Validate label
    if len(label_str) > 2 or not 0 <= int(label_str) <= number_of_labels:
        raise ValueError(f"The label must be between 0 and {number_of_labels}.")

    # Label file path
    label_file = f"{path}/00_label_{label_str}.csv"

    # List all CSV files in the directory
    csv_files = glob.glob(f'{path}/*.csv')

    # Remove the label file from the list
    csv_files.remove(label_file)

    # Load the label file into a DataFrame and assign class 1
    df_positive = pd.read_csv(label_file)[['smiles']]
    if not df_positive.empty:
        df_positive['y'] = 1

        # Load the other CSV files into a DataFrame and assign class 0
        df_negative = pd.concat([pd.read_csv(file)[['smiles']] for file in csv_files])
        df_negative['y'] = 0

        # Combine the two DataFrames
        df = pd.concat([df_positive, df_negative])

        # Remove duplicates
        df.drop_duplicates(subset='smiles', inplace=True)

    else:
        df = pd.DataFrame()
    
    return df


def get_smiles_from_drug_names(drug_list, new_client, return_type='list'):
    molecule = new_client.molecule
    smiles_list = []
    drug_names = []

    for drug in drug_list:
        res = molecule.search(drug)

        if res:
            smiles_list.append(res[0]['molecule_structures']['canonical_smiles'])
            drug_names.append(drug)

    # Check return type
    if return_type == 'dataframe':
        # Convert list to DataFrame
        smiles_df = pd.DataFrame({
            'drug_name': drug_names,
            'smiles': smiles_list
        })
        return smiles_df
    elif return_type == 'list':
        return smiles_list
    else:
        raise ValueError("Invalid return_type. Expected 'dataframe' or 'list'.")

In [3]:
drug_list_to_exclude = [
    "Amodiaquine","Crizotinib","Mebhydrolin","Harringtonin",
    "Fluphenazine","Fingolimod","Fendiline","Dronedarone",
    "Perphenazine","Pimavanserin","Prochlorperazine","Raloxifene",
    "Abemaciclib","Mefloquine","Revaprazan","Melitracen",
    "Nelfinavir","Nicardipine","Nilotinib","Olmutinib",
    "harringtonin","Terconazole","Thioridazine","Thiothixene",
    "Tilorone-dihydrochloride","Triflupromazine"
]

In [4]:
smiles_list = get_smiles_from_drug_names(drug_list_to_exclude,
                                         new_client, return_type='list')


In [5]:
smiles_dataframe = get_smiles_from_drug_names(drug_list_to_exclude,
                                              new_client, return_type='dataframe')
to_replace = 'CN1CCN(CCCN2c3ccccc3Sc3ccc(Cl)cc32)CC1.O=S(=O)(O)CCS(=O)(=O)O'
replacement = 'CN1CCN(CCCN2c3ccccc3Sc3ccc(Cl)cc32)CC1'

# Replace in the entire DataFrame
smiles_dataframe = smiles_dataframe.replace(to_replace, replacement)

In [6]:
smiles_dataframe.iloc[10][1]

'CN1CCN(CCCN2c3ccccc3Sc3ccc(Cl)cc32)CC1'

In [7]:
now = datetime.now()
folder_name = now.strftime("%Y_%b_%d") + str('_deepchem_code_classifier')
folder_name = folder_name.lower()
if os.path.isdir(folder_name):
    shutil.rmtree(folder_name)
os.makedirs(folder_name)

df_label = pd.read_csv('label-all-new.csv', header=None)
df_label.reset_index(inplace=True)
df_label.rename(columns={'index': 'label', 0: 'location'}, inplace=True)
df_label = df_label[['label', 'location']]
result_dict = {}


In [8]:
number_of_files = len([f for f in os.listdir('final_label_to_train_LV') if os.path.isfile(os.path.join('final_label_to_train_LV', f))])
smiles_prediction_results = {
                                    'index':[],
                                    'smile':[],
                                    'label_location':[],
                                    'class0_proba':[],
                                    'class1_proba':[],
                                    'class':[],
                                    
}

for label_location in range(number_of_files):
    print(label_location)
    df = data_training('final_label_to_train_LV', label_location)
    if not df.empty:
        df = df[~df['smiles'].isin(smiles_list)]
        location = df_label.loc[df_label['label'] == label_location, 'location'].values[0]
    if not df.empty:
        temp_file = tempfile.NamedTemporaryFile(suffix=".csv", delete=False)
        df.to_csv(temp_file.name, index=False)

        featurizer = dc.feat.ConvMolFeaturizer()

        loader = dc.data.CSVLoader(tasks=['y'], feature_field="smiles", featurizer=featurizer)
        dataset = loader.create_dataset(temp_file.name)

        splitter = dc.splits.RandomSplitter()
        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset)

        model = dc.models.GraphConvModel(n_tasks=1, mode='classification', dropout=0.2, model_dir='.')
        model.fit(train_dataset, nb_epoch=100)
        date_str = datetime.now().strftime('%b_%Y_%d')  # e.g., Nov_2023_16
        model_dir = f'deepchem_models/deepchem_{date_str}'
        os.makedirs(model_dir, exist_ok=True)
        model_path = f'{model_dir}/deepchem_{label_location}_{date_str}'
#         roc_auc_metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
#         print("Training set AUC:", model.evaluate(train_dataset, [roc_auc_metric], transformers=[]))
#         print("Validation set AUC:", model.evaluate(valid_dataset, [roc_auc_metric], transformers=[]))

        predictions = model.predict(test_dataset)
        predicted_classes = np.argmax(predictions, axis=2).flatten()

        
        
        true_labels = test_dataset.y.flatten()

        for index, new_smile in enumerate(smiles_list):
            featurizer_new_drug = dc.feat.ConvMolFeaturizer()
            features_new_drug = featurizer_new_drug.featurize([new_smile])
            
            # Create a DeepChem dataset from the featurized data
            new_dataset_new_drug = dc.data.NumpyDataset(X=features_new_drug)
            
            # Predict the class
            # Assuming 'model' is already trained as in your provided code
            predictions_new_drug = model.predict(new_dataset_new_drug)
            predicted_class_new_drug = np.argmax(predictions_new_drug, axis=2).flatten()
            # print("predictions_new_drug:", predictions_new_drug)
            # print("predicted_class_new_drug:", predicted_class_new_drug)
            # print("Predicted Class:", predicted_class_new_drug[0])
            
            

            smiles_prediction_results['index'].append(index)
            smiles_prediction_results['smile'].append(new_smile)
            smiles_prediction_results['label_location'].append(label_location)
            smiles_prediction_results['class0_proba'].append(predictions_new_drug[0][0][0])
            smiles_prediction_results['class1_proba'].append(predictions_new_drug[0][0][1])
            smiles_prediction_results['class'].append(predicted_class_new_drug[0])



        
    
        f1_overall = f1_score(true_labels, predicted_classes, average='binary')
        f1_class_0 = f1_score(true_labels, predicted_classes, pos_label=0)
        f1_class_1 = f1_score(true_labels, predicted_classes, pos_label=1)
        accuracy = accuracy_score(true_labels, predicted_classes)
        cohen_kappa = cohen_kappa_score(true_labels, predicted_classes)

        print(f"F1 Score (Overall): {f1_overall}")
        print(f"F1 Score (Class 0): {f1_class_0}")
        print(f"F1 Score (Class 1): {f1_class_1}")
        print(f"Accuracy: {accuracy}")
        unique_key = f"deepchem_{label_location}"
        result_dict[unique_key] = {
                                'method': 'deepchem',
                                'location': location,
                                
                                'accuracy': accuracy,
                                'kappa': cohen_kappa,
                                'f1_score': f1_overall,
                                'f1_score_class_0': f1_class_0,
                                'f1_score_class_1': f1_class_1,
                                'label_1': len(df[df['y']==1]),
                                'label_0': len(df[df['y']==0]),
        
                            }
        temp_file.close()
        

0
1


2023-12-05 20:23:35.929243: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7368 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1080, pci bus id: 0000:02:00.0, compute capability: 6.1
2023-12-05 20:23:35.929843: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 7369 MB memory:  -> device: 1, name: NVIDIA GeForce GTX 1080, pci bus id: 0000:82:00.0, compute capability: 6.1
2023-12-05 20:23:46.249727: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f115c0375b0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-12-05 20:23:46.249849: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce GTX 1080, Compute Capability 6.1
2023-12-05 20:23:46.249863: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (1): NVIDIA GeForce GTX 10

F1 Score (Overall): 0.38188976377952755
F1 Score (Class 0): 0.8796934865900383
F1 Score (Class 1): 0.38188976377952755
Accuracy: 0.7985888389993585
2
F1 Score (Overall): 0.3862520458265139
F1 Score (Class 0): 0.8504188272836058
F1 Score (Class 1): 0.3862520458265139
Accuracy: 0.7594611930724824
3
F1 Score (Overall): 0.18918918918918923
F1 Score (Class 0): 0.9802890932982917
F1 Score (Class 1): 0.18918918918918923
Accuracy: 0.9615137908915972
4
F1 Score (Overall): 0.4390832328106152
F1 Score (Class 0): 0.7968545216251639
F1 Score (Class 1): 0.4390832328106152
Accuracy: 0.7017318794098781
5
F1 Score (Overall): 0.8042462845010615
F1 Score (Class 0): 0.3958060288335518
F1 Score (Class 1): 0.8042462845010615
Accuracy: 0.7042976266837716
6
F1 Score (Overall): 0.4022662889518414
F1 Score (Class 0): 0.8250414593698177
F1 Score (Class 1): 0.4022662889518414
Accuracy: 0.7293136626042335
7
F1 Score (Overall): 0.434640522875817
F1 Score (Class 0): 0.8619313647246609
F1 Score (Class 1): 0.434640522

In [10]:
type(model)

deepchem.models.graph_models.GraphConvModel

In [11]:
df_result = pd.DataFrame.from_dict(result_dict, orient='index')


In [17]:
df_result

Unnamed: 0,method,location,accuracy,kappa,f1_score,f1_score_class_0,f1_score_class_1,label_1,label_0
deepchem_1,deepchem,cell_junctions,0.798589,0.26415,0.38189,0.879693,0.38189,2996,12594
deepchem_2,deepchem,centrosome,0.759461,0.245646,0.386252,0.850419,0.386252,3668,11922
deepchem_3,deepchem,cytoplasmic_bodies,0.961514,0.169537,0.189189,0.980289,0.189189,393,15197
deepchem_4,deepchem,cytoskeleton,0.701732,0.236189,0.439083,0.796855,0.439083,4207,11383
deepchem_5,deepchem,cytosol,0.704298,0.228841,0.804246,0.395806,0.804246,10380,5210
deepchem_6,deepchem,endoplasmic_reticulum,0.729314,0.228107,0.402266,0.825041,0.402266,3225,12365
deepchem_7,deepchem,endosome,0.778063,0.302279,0.434641,0.861931,0.434641,3802,11788
deepchem_8,deepchem,endosome_membrane,0.797947,0.236657,0.355828,0.880183,0.355828,2583,13007
deepchem_9,deepchem,er_lumen,0.92431,0.375379,0.415842,0.959534,0.415842,1039,14551
deepchem_10,deepchem,er_membrane,0.691469,0.278229,0.495278,0.777829,0.495278,5539,10051


In [13]:
predictions_new_drug.shape

(1, 1, 2)

In [18]:
df_result.to_csv('ZZ-deepchem.csv', index = False)

In [15]:
df_smiles_prediction_results = pd.DataFrame.from_dict(smiles_prediction_results)
df_smiles_prediction_results

Unnamed: 0,index,smile,label_location,class0_proba,class1_proba,class
0,0,CCN(CC)Cc1cc(Nc2ccnc3cc(Cl)ccc23)ccc1O,1,0.872882,0.127118,0
1,1,C[C@@H](Oc1cc(-c2cnn(C3CCNCC3)c2)cnc1N)c1c(Cl)...,1,0.204842,0.795158,1
2,2,CN1CCc2c(c3ccccc3n2Cc2ccccc2)C1,1,0.733403,0.266597,0
3,3,COC(=O)C[C@@](O)(CCC(C)(C)O)C(=O)O[C@@H]1C(OC)...,1,0.869752,0.130248,0
4,4,OCCN1CCN(CCCN2c3ccccc3Sc3ccc(C(F)(F)F)cc32)CC1,1,0.571361,0.428639,0
...,...,...,...,...,...,...
853,21,CC(C)N1CCN(c2ccc(OC[C@H]3CO[C@](Cn4cncn4)(c4cc...,33,0.785366,0.214634,0
854,22,CSc1ccc2c(c1)N(CCC1CCCCN1C)c1ccccc1S2,33,0.960022,0.039978,0
855,23,CN1CCN(CC/C=C2/c3ccccc3Sc3ccc(S(=O)(=O)N(C)C)c...,33,0.977273,0.022727,0
856,24,CCN(CC)CCOc1ccc2c(c1)C(=O)c1cc(OCCN(CC)CC)ccc1-2,33,0.995796,0.004204,0


In [16]:
df_smiles_prediction_results.to_csv('ZZ-smiles_prediction_results.csv', index=False)

In [12]:
predictions_new_drug[0][0][1]

0.01864652

In [9]:
df_result

NameError: name 'df_result' is not defined

In [13]:
result_dict

{'deepchem_1': {'method': 'deepchem',
  'location': 'cell_junctions',
  'accuracy': 0.7844772289929441,
  'kappa': 0.2325800128336981,
  'f1_score': 0.36121673003802285,
  'f1_score_class_0': 0.8703703703703703,
  'f1_score_class_1': 0.36121673003802285,
  'label_1': 2996,
  'label_0': 12594},
 'deepchem_2': {'method': 'deepchem',
  'location': 'centrosome',
  'accuracy': 0.7575368826170622,
  'kappa': 0.248383385668698,
  'f1_score': 0.39616613418530355,
  'f1_score_class_0': 0.848314606741573,
  'f1_score_class_1': 0.39616613418530355,
  'label_1': 3668,
  'label_0': 11922},
 'deepchem_3': {'method': 'deepchem',
  'location': 'cytoplasmic_bodies',
  'accuracy': 0.9634381013470174,
  'kappa': 0.22129919293356815,
  'f1_score': 0.24000000000000002,
  'f1_score_class_0': 0.9812684850476504,
  'f1_score_class_1': 0.24000000000000002,
  'label_1': 393,
  'label_0': 15197},
 'deepchem_4': {'method': 'deepchem',
  'location': 'cytoskeleton',
  'accuracy': 0.6966003848620911,
  'kappa': 0.21