#### You just fill in the blank spaces (where it says code). You don't need to touch the other parts.

### You should submit an ipynb file with the output: "Project_{id}_{name}.ipynb"
### 📧If you have any question, you can ask by e-mail: sisifhro@kaist.ac.kr

In [225]:
#@title Install rdkit
!apt-get install -y python-rdkit librdkit1 rdkit-data
!pip install rdkit

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
E: Unable to locate package python-rdkit


In [226]:
#@title Import package
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

import copy
import csv
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from rdkit import Chem
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### ClinTox Dataset
We will classify ClinTox dataset which compares drugs approved by the FDA and drugs that have failed clinical trials for toxicity reasons.
You can download this datset with the code below.

In [227]:
'''
The code for download ClinTox dataset. The folder 'clintox' would be generated.
In the 'clintox; folder, the files of clintox_train.csv, clintox_valid.csv, clintox_test.csv would exist.
'''
!git clone https://huggingface.co/datasets/zpn/clintox

fatal: destination path 'clintox' already exists and is not an empty directory.


In [228]:
#@title Load data
'''
In Practice 3, we separated the data into train, valid, and test through 'train_test_split'.
However, since the downloaded data is already divided into train, valid, and, test you can just load each data.
'''
train_data = pd.read_csv('clintox/clintox_train.csv')
valid_data = pd.read_csv('clintox/clintox_valid.csv')
test_data = pd.read_csv('clintox/clintox_test.csv')

### 1. Extract ECFP

Extract ECFP from clintox dataset. You should define 'get ECFP' function which  ECFP list for each smile.

In [229]:
def get_ecfp(smiles):
    '''
    input: (string) SMILES
    output: (list) ECFP fingerprint list
    '''
    ################ Code ################
    mol=Chem.MolFromSmiles(smiles)
    ecfp=GetMorganFingerprintAsBitVect(mol,radius=2,nBits=1024)
    ecfp_list=list(map(int,ecfp.ToBitString()))
    return ecfp_list
    ######################################

In [230]:
'''
In this section, you should specify variables (X_train, X_valid, X_test, y_train, y_valid, y_test)

X_train, X_valid, X_test: (list) all ECFP fingerprint list for all SMILES
y_train, y_valid, y_test = (list) all target value (0 or 1) list for all SMILES
'''
################ Code ################
X_train=list()
for smiles in train_data['smiles']:
    X_train.append(get_ecfp(smiles))

X_valid=list()
for smiles in valid_data['smiles']:
    X_valid.append(get_ecfp(smiles))

X_test=list()
for smiles in test_data['smiles']:
    X_test.append(get_ecfp(smiles))

y_train=train_data['target'].tolist()
y_valid=valid_data['target'].tolist()
y_test=test_data['target'].tolist()
######################################

### 2. Machine Learning model

You should construct 2 classification model (SVM and RF) to predict whether the drugs are toxic (0 of 1). You can train the model from the ECFP. You will get full credit if you get accuracy of test dataset above 0.92 for each model.

In [231]:
# SVC
################ Code ################
from sklearn.svm import SVC
svm=SVC(kernel='linear',C=1.0,random_state=1)
svm.fit(X_train, y_train)
######################################

In [232]:
print(svm.score(X_train, y_train))
print(svm.score(X_test, y_test))

0.9805249788314987
0.9256756756756757


In [233]:
# RF
################ Code ################
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100,random_state=1)
rf.fit(X_train, y_train)
######################################

In [234]:
print(rf.score(X_train,y_train))
print(rf.score(X_test,y_test))

0.9822184589331076
0.9324324324324325


### 3. Pytorch Regression

You should construct classification for classifying whether the drugs are toxic (0 or 1). You should use 3 linear layers and ReLU activation layers. You can modify learning rate, batch size, and num_epochs, and hidden dimension. You will get full credit if you get accuracy of dataset above 0.92 for the model and satisfy above condition.

In [235]:
#@title Define Dataset

class SMILESDataset(Dataset):
    def __init__(self, data):
        '''
        input : dataframe

        You can specify self.X, self.y
        self.X: An array that extracted ECFP for all smiles
        self.y: An array of target values for all smiles
        '''

        ################ Code ################
        self.data=data
        ecfp_vectors=list()
        for smiles in data['smiles']:
            ecfp=get_ecfp(smiles)
            ecfp_vectors.append(ecfp)
        self.X=torch.tensor(ecfp_vectors)
        self.y=torch.tensor(data['target'].values)
        ######################################


    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        data_dict = dict()
        data_dict['X'] = self.X[idx]
        data_dict['y'] = self.y[idx]
        return data_dict

In [236]:
train_dataset = SMILESDataset(train_data)
valid_dataset = SMILESDataset(valid_data)
test_dataset = SMILESDataset(test_data)

In [237]:
#@title Define Dataloader
'''
You can modify batch_size.
'''
################ Code ################
batch_size=64
######################################

train_data_loader = DataLoader(dataset = train_dataset, batch_size = batch_size, shuffle = True)
val_data_loader = DataLoader(dataset = valid_dataset, batch_size = batch_size, shuffle = False)
test_data_loader = DataLoader(dataset = test_dataset, batch_size = batch_size, shuffle = False)

In [238]:
#@title Define Model (Classifier)
class Classifier(nn.Module):
################ Code ################
    def __init__(self,in_dim,hid_dim,out_dim):
        super(Classifier,self).__init__()
        self.linear1=nn.Linear(in_dim,hid_dim)
        self.linear2=nn.Linear(hid_dim,hid_dim)
        self.linear3=nn.Linear(hid_dim,out_dim)
        self.relu=nn.ReLU()

    def forward(self,x):
        x =self.relu(self.linear1(x))
        x=self.relu(self.linear2(x))
        x=self.linear3(x)
        return x
    ######################################

In [239]:
#@title Define Hyperparameters
'''
You should set hyperparameters. you can modify the learning rate and hidden dimensions.

'''
################ Code ################
in_dim=1024
hid_dim=512
out_dim=2
lr=1e-3
num_epoch=100
######################################

In [240]:
'''
You should specify model.

'''
################ Code ################
model=Classifier(in_dim=in_dim,hid_dim=hid_dim,out_dim=out_dim)
######################################
crossentropy_loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

In [241]:
#@title Train
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)


num_train_data = len(train_dataset)
num_val_data = len(valid_dataset)

train_loss_history = []
val_loss_history = []

best_train_loss = 99999999
best_val_loss = 99999999

print(f"---------- {num_epoch} times train and test START! ----------")
print(f"Model name: \t\t{type(model).__name__}")
print()
print()


# model.cuda()   # Turn on when you want to use GPU
start_time = time.time()
for epoch in range(1, num_epoch+1):

    # Train
    model.train()
    train_loss = []
    for i, data in enumerate(train_data_loader):
        x = data['X'].float()
        label_true = data['y'].long()
        # x = x.cuda()  # Turn on when you want to use GPU
        # label_true = label_true.cuda()  # Turn on when you want to use GPU
        optimizer.zero_grad()
        label_pred = model(x)

        loss = crossentropy_loss(label_pred, label_true)
        loss.backward()
        optimizer.step()

        loss = copy.deepcopy(loss.data.cpu().numpy())
        train_loss.append(loss)
    train_mean_loss = np.sum(train_loss)/num_train_data
    train_loss_history.append(train_mean_loss)

    if best_train_loss > train_mean_loss:
        best_train_epoch = epoch
        best_train_loss = train_mean_loss
        best_train_model = copy.deepcopy(model.state_dict())

    if epoch == 0:
        print(f"epoch time: {time.time()-start_time:.2f}")

    if epoch % 10 == 0 or epoch == num_epoch:
            print(f"epoch: {epoch} / {num_epoch} epoch \t loss: {train_mean_loss:.4f}")

    # Validation
    model.eval()
    val_loss = []
    with torch.no_grad():
        for i, data in enumerate(val_data_loader):
            x = data['X'].float()
            label_true = data['y'].long()
            # x = x.cuda()  # Turn on when you want to use GPU
            # label_true = label_true.cuda()  # Turn on when you want to use GPU

            label_pred = model(x)

            loss = crossentropy_loss(label_pred, label_true)

            loss = copy.deepcopy(loss.data.cpu().numpy())
            val_loss.append(loss)
        val_mean_loss = np.sum(val_loss)/num_val_data
        val_loss_history.append(val_mean_loss)


        if best_val_loss > val_mean_loss:
            best_val_epoch = epoch + 1
            best_val_loss = val_mean_loss
            best_val_model = copy.deepcopy(model.state_dict())

            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss,

            }, 'best_model.ckpt')

print()
print("---------- TRAIN RESULTS ----------")
print(f"Best epoch \t: {best_train_epoch}")
print(f"Best loss \t: {best_train_loss:.4f}")
print()

print("---------- VALIDATION RESULTS ----------")
print(f"Best epoch \t: {best_val_epoch}")
print(f"Best loss \t: {best_val_loss:.4f}")


---------- 100 times train and test START! ----------
Model name: 		Classifier


epoch: 10 / 100 epoch 	 loss: 0.0008
epoch: 20 / 100 epoch 	 loss: 0.0004
epoch: 30 / 100 epoch 	 loss: 0.0004
epoch: 40 / 100 epoch 	 loss: 0.0005
epoch: 50 / 100 epoch 	 loss: 0.0004
epoch: 60 / 100 epoch 	 loss: 0.0004
epoch: 70 / 100 epoch 	 loss: 0.0004
epoch: 80 / 100 epoch 	 loss: 0.0004
epoch: 90 / 100 epoch 	 loss: 0.0005
epoch: 100 / 100 epoch 	 loss: 0.0004

---------- TRAIN RESULTS ----------
Best epoch 	: 89
Best loss 	: 0.0004

---------- VALIDATION RESULTS ----------
Best epoch 	: 2
Best loss 	: 0.0046


In [242]:
#@title Test
# checkpoint = torch.load('best_model.ckpt')
# model = Classifier(('you should write parameters'))
# optimizer = optim.Adam(model.parameters(), lr=lr)
# model.load_state_dict(checkpoint['model_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])


model.eval()
test_loss, pred_list, label_list = [], [], []
with torch.no_grad():
    for i, data in enumerate(test_data_loader):
        x = data['X'].float()
        label_true = data['y'].long()
        # x = x.cuda()  # Turn on when you want to use GPU
        # label_true = label_true.cuda()  # Turn on when you want to use GPU

        label_pred = model(x)
        _,pred = label_pred.cpu().topk(1,dim=1)
        pred = pred.reshape(-1)

        pred_list.append(pred)
        label_list.append(label_true)

        loss = crossentropy_loss(label_pred, label_true)

        loss = copy.deepcopy(loss.data.cpu().numpy())
        test_loss.append(loss)

all_pred_list = torch.concat(pred_list)
all_label_list = torch.concat(label_list)
print(f'accuracy: {accuracy_score(all_label_list, all_pred_list)} ')

accuracy: 0.9391891891891891 
