!pip install rdkit-pypi

In [2]:
# import statements
from rdkit import Chem
from rdkit.Chem import DataStructs, AllChem

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split # Functipn to split data into training, validation and test sets
import pickle
import glob   # The glob module finds all the pathnames matching a specified pattern according to the rules used by the Unix shell, although results are returned in arbitrary order. No tilde expansion is done, but *, ?, and character ranges expressed with [] will be correctly matched.
import os   # miscellneous operating system interfaces. This module provides a portable way of using operating system dependent functionality. If you just want to read or write a file see open(), if you want to manipulate paths, see the os.path module, and if you want to read all the lines in all the files on the command line see the fileinput module.
import random       
from tqdm import tqdm 
from tqdm.notebook import tqdm_notebook
import datetime

# Torch
import torch
from torchvision import transforms
import torchvision.models as models
import torch.nn as nn




In [3]:
now = datetime.datetime.now()
now = now.strftime("%d_%m_%Y-%H:%M:%S")

# Pseudo Code
1. download train, validation and test data
2. keep only columns with relevant data (compound name, smile strings, moa)
    1. assert that dmso not in all data values
3. create function that gets produces a dictionary with all relevant moas and assigns a number to them
    1. extract unique values
    2. enumerate loop, add to growing dictionary, where name is the key.
4. set these values as moa
5. do one hot encoding
6. Fix torch_set so that it can handle pandas instead of an extra dictionary 

In [4]:
compounds_v1v2 = pd.read_csv('/home/jovyan/Tomics-CP-Chem-MoA/data_for_models/compounds_v1v2.csv', delimiter = ",")
    

In [5]:
# testing using pandas dataframe
training_set = pd.read_csv('/home/jovyan/Tomics-CP-Chem-MoA/data_for_models/CS_data_splits/CS_training_set_cyclo_adr_2.csv')
validation_set = pd.read_csv('/home/jovyan/Tomics-CP-Chem-MoA/data_for_models/CS_data_splits/CS_valid_set_cyclo_adr_2.csv')
test_set = pd.read_csv('/home/jovyan/Tomics-CP-Chem-MoA/data_for_models/CS_data_splits/CS_test_set_cyclo_adr_2.csv')

In [14]:
test_set.moa.value_counts()

0    10
1     4
Name: moa, dtype: int64

In [10]:
with open('/home/jovyan/Tomics-CP-Chem-MoA/data_for_models/CS_data_splits/cyclo_adr_2_moa_dict.pickle', 'rb') as handle:
        moa_dict = pickle.load(handle)
moa_dict

{'cyclooxygenase inhibitor': 0, 'dopamine receptor antagonist': 1}

In [12]:
target= [None]*len(moa_dict)
for i in moa_dict.items():
    target[i[1]] = i[0]

In [13]:
target

['cyclooxygenase inhibitor', 'dopamine receptor antagonist']

In [6]:
def splitting_into_tensor(df, num_classes):
    '''Splitting data into two parts:
    1. input : the pointer showing where the transcriptomic profile is  
    2. target one hot : labels (the correct MoA) '''
    
    # one-hot encoding labels
     # creating tensor from all_data.df
    target = torch.tensor(df['moa'].values.astype(np.int64))

    # For each row, take the index of the target label
    # (which coincides with the score in our case) and use it as the column index to set the value 1.0.” 
    #target_onehot = torch.zeros(target.shape[0], num_classes)
    #target_onehot.scatter_(1, target.unsqueeze(1), 1.0)
    
    input =  df.drop('moa', axis = 1)
    
    return input, target #target_onehot

In [7]:
assert training_set.moa.unique().all() == validation_set.moa.unique().all() == test_set.moa.unique().all()

In [8]:
training_set.moa.unique()

array([0, 1])

In [474]:
num_classes = len(training_set.moa.unique())
num_classes

2

In [475]:
training_df, train_labels = splitting_into_tensor(training_set, num_classes)
validation_df, validation_labels = splitting_into_tensor(validation_set, num_classes)
test_df, test_labels = splitting_into_tensor(test_set, num_classes)

In [476]:
# got to put the labels into the same array as compounds_moa in order for the indexing to work. Biggest problem is I only have
# have 250 compounds to train, not 120000, which probabilty artificially gives a bunch of extra compounds I don't have

In [477]:
batch_size = 50
# parameters
params = {'batch_size' : batch_size,
         'num_workers' : 3,
         'shuffle' : True,
         'prefetch_factor' : 1} 
          
# shuffle isn't working

# Datasets
#partition = partition
#labels = labels

# maxepochs
max_epochs = 250

In [478]:
device = (torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'))
#device = torch.device('cpu')
print(f'Training on device {device}. ' )

Training on device cpu. 


In [479]:
# A function changing SMILES to Morgan fingerprints 
def smiles_to_array(smiles):
    molecules = Chem.MolFromSmiles(smiles) 
    fingerprints = AllChem.GetMorganFingerprintAsBitVect(molecules, 2)
    x_array = []
    arrays = np.zeros(0,)
    DataStructs.ConvertToNumpyArray(fingerprints, arrays)
    x_array.append(arrays)
    x_array = np.asarray(x_array)
    x_array = ((np.squeeze(x_array)).astype(int)) 
    x_array = torch.from_numpy(x_array)
    return x_array                  

In [480]:
# create Torch.dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, compound_df, labels, transform=None):
        self.compound_labels = labels    # the entire length of the correct classes that we are trying to predict
        # print(self.img_labels)
        self.compound_df = compound_df        # list of indexes that are a part of training, validation, tes sets
        self.transform = transform       # any transformations done

    def __len__(self):
        ''' The number of data points'''
        return len(self.compound_labels)      

    def __getitem__(self, idx):
        '''Retrieving the compound '''
        #print(idx)
        smile_string = self.compound_df["SMILES"][idx]      # returns smiles by using compound as keys
        #print(smile_string)
        compound_array = smiles_to_array(smile_string)
        #print(f' return from function: {compound}')
        #print(f' matrix: {compound_array}')
        label = self.compound_labels[idx]             # extract classification using index
        #print(f' label: {label}')
        #label = torch.tensor(label, dtype=torch.float)
        if self.transform:                         # uses Albumentations image pipeline to return an augmented image
            compound = self.transform(compound)
        return compound_array.float(), label.long()

## Generators

In [481]:
# Create a dataset with all indices and labels
training_dataset = Dataset(training_df, train_labels)
valid_dataset = Dataset(validation_df, validation_labels)
test_dataset = Dataset(test_df, test_labels)

In [482]:
assert len(training_df) == len(train_labels)
assert len(validation_df) == len(validation_labels)
assert len(test_df) == len(test_labels)

In [483]:
# create generator that randomly takes indices from the training set
training_generator = torch.utils.data.DataLoader(training_dataset, **params)
validation_generator = torch.utils.data.DataLoader(valid_dataset, **params)
test_generator = torch.utils.data.DataLoader(test_dataset, **params)
# training_set = Dataset(partition["train"], labels)

In [484]:
#partition["train"]

In [485]:
# training data loader
# Display image and label   # functional
train_features, train_labels = next(iter(training_generator))
print(f"Feature batch shape: {train_features}")
print(f"Labels batch shape: {train_labels}")

Feature batch shape: tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
Labels batch shape: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
        0, 0])


In [486]:
valid_features, valid_labels = next(iter(validation_generator))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")

Feature batch shape: torch.Size([50, 2048])
Labels batch shape: torch.Size([50])


In [487]:
test_features, test_labels = next(iter(test_generator))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")

Feature batch shape: torch.Size([50, 2048])
Labels batch shape: torch.Size([50])


In [488]:
# If applying class weights
apply_class_weights = True
if apply_class_weights:     # if we want to apply class weights
    counts = training_set.moa.value_counts()  # count the number of moa in each class for the ENTiRE dataset
    print(counts)
    class_weights = []   # create list that will hold class weights
    for moa in training_set.moa.unique():       # for each moa   
        #print(moa)
        counts[moa]
        class_weights.append(counts[moa])  # add counts to class weights
    print(len(class_weights))
    print(class_weights)
    print(type(class_weights))
    # class_weights = 1 / (class_weights / sum(class_weights)) # divide all class weights by total moas
    class_weights = [i / sum(class_weights) for  i in class_weights]
    class_weights= torch.tensor(class_weights,dtype=torch.float).to(device) # transform into tensor, put onto device
print(class_weights)

0    45
1    18
Name: moa, dtype: int64
2
[45, 18]
<class 'list'>
tensor([0.7143, 0.2857])


In [489]:
# Creating Architecture
units = 64
drop  = 0.7

seq_model = nn.Sequential(
    nn.Linear(2048, 128),
    nn.ReLU(),
    nn.Dropout(p = drop),
    nn.Linear(128,64),
    nn.ReLU(),
    nn.Linear(64, num_classes))

In [490]:
seq_model

Sequential(
  (0): Linear(in_features=2048, out_features=128, bias=True)
  (1): ReLU()
  (2): Dropout(p=0.7, inplace=False)
  (3): Linear(in_features=128, out_features=64, bias=True)
  (4): ReLU()
  (5): Linear(in_features=64, out_features=2, bias=True)
)

In [491]:
# optimizer_algorithm
#cnn_optimizer = torch.optim.Adam(updated_model.parameters(),weight_decay = 1e-6, lr = 0.001, betas = (0.9, 0.999), eps = 1e-07)
optimizer = torch.optim.Adam(seq_model.parameters(), lr = 1e-4)
# loss_function
if apply_class_weights == True:
    loss_function = torch.nn.CrossEntropyLoss(class_weights)
else:
    loss_function = torch.nn.CrossEntropyLoss()

In [492]:
'''# complete the architecture of MLP and compile MLP 

units = 64  
drop = 0.89  

model_mlp = Sequential()
model_mlp.add(Dense(units, input_dim = 2048, activation = 'relu'))
model_mlp.add(Dropout(drop))
model_mlp.add(Dense(10, activation = 'softmax'))
model_mlp.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-4),
         loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True),
         metrics = ['accuracy'])'''

"# complete the architecture of MLP and compile MLP \n\nunits = 64  \ndrop = 0.89  \n\nmodel_mlp = Sequential()\nmodel_mlp.add(Dense(units, input_dim = 2048, activation = 'relu'))\nmodel_mlp.add(Dropout(drop))\nmodel_mlp.add(Dense(10, activation = 'softmax'))\nmodel_mlp.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-4),\n         loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True),\n         metrics = ['accuracy'])"

In [493]:
def training_loop(n_epochs, optimizer, model, loss_fn, train_loader, valid_loader):
    '''
    n_epochs: number of epochs 
    optimizer: optimizer used to do backpropagation
    model: deep learning architecture
    loss_fn: loss function
    train_loader: generator creating batches of training data
    valid_loader: generator creating batches of validation data
    '''
    # lists keep track of loss and accuracy for training and validation set
    #optimizer = torch.optim.Adam(updated_model.parameters(),weight_decay = 1e-6, lr = 0.001, betas = (0.9, 0.999), eps = 1e-07)
    train_loss_per_epoch = []
    train_acc_per_epoch = []
    val_loss_per_epoch = []
    val_acc_per_epoch = []
    best_val_loss = np.inf
    for epoch in tqdm(range(1, max_epochs +1), desc = "Epoch", position = 0, leave = True):
        loss_train = 0.0
        train_total = 0
        train_correct = 0
        for compounds, labels in train_loader:
            optimizer.zero_grad()
            # put model, images, labels on the same device
            model = model.to(device)
            compounds = compounds.to(device = device)
            labels = labels.to(device= device)
            #print(f' Compounds {compounds}')
            #print(f' Labels {labels}')
            
            #print(labels)
            # Training Model
            outputs = model(compounds)
            #print(f' Outputs : {outputs}') # tensor with 10 elements
            #print(f' Labels : {labels}') # tensor that is a number
            loss = loss_fn(outputs,labels)
            # Update weights
            loss.backward()
            optimizer.step()
            # Training Metrics
            loss_train += loss.item()
            #print(f' loss: {loss.item()}')
            train_predicted = torch.argmax(outputs, 1)
            #print(f' train_predicted {train_predicted}')
            # NEW
            #labels = torch.argmax(labels,1)
            #print(labels)
            train_total += labels.shape[0]
            train_correct += int((train_predicted == labels).sum())
        # validation metrics from batch
        val_correct, val_total, val_loss, best_val_loss_upd = validation_loop(model, loss_fn, valid_loader, best_val_loss, epoch)
        val_accuracy = val_correct/val_total
        # printing results for epoch
        if epoch == 1 or epoch %2 == 0:
            print(f' {datetime.datetime.now()} Epoch: {epoch}, Training loss: {loss_train/len(train_loader)}, Validation Loss: {val_loss} ')
        # adding epoch loss, accuracy to lists 
        val_loss_per_epoch.append(val_loss)
        train_loss_per_epoch.append(loss_train/len(train_loader))
        val_acc_per_epoch.append(val_accuracy)
        train_acc_per_epoch.append(train_correct/train_total)
    # return lists with loss, accuracy every epoch
    return train_loss_per_epoch, train_acc_per_epoch, val_loss_per_epoch, val_acc_per_epoch

In [494]:
def validation_loop(model, loss_fn, valid_loader, best_val_loss, epoch):
    '''
    Assessing trained model on valiidation dataset 
    model: deep learning architecture getting updated by model
    loss_fn: loss function
    valid_loader: generator creating batches of validation data
    '''
    loss_val = 0.0
    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():  # does not keep track of gradients so as to not train on validation data.
        for compounds, labels in valid_loader:
            # Move to device MAY NOT BE NECESSARY
            model = model.to(device)
            compounds = compounds.to(device = device)
            labels = labels.to(device= device)
            # Assessing outputs
            outputs = model(compounds)
            #print(f' Outputs : {outputs}') # tensor with 10 elements
            #print(f' Labels : {labels}') # tensor that is a number
            loss = loss_fn(outputs,labels)
            loss_val += loss.item()
            predicted = torch.argmax(outputs, 1)
            #labels = torch.argmax(labels,1)
            #print(predicted)
            #print(labels)
            total += labels.shape[0]
            correct += int((predicted == labels).sum())
        avg_val_loss = loss_val/len(valid_loader)  # average loss over batch
        if best_val_loss > loss_val:
            best_val_loss = loss_val
            torch.save(
                {'epoch': epoch,
                    'model_state_dict' : model.state_dict(),
                    'valid_loss' : loss_val
            },  '/home/jovyan/Tomics-CP-Chem-MoA/01_CStructure_Models/saved_models/pre_split/' + 'ChemStruc_least_loss_model'
            )
    model.train()
    return correct, total, avg_val_loss, best_val_loss

In [495]:
def test_loop(model, loss_fn, test_loader):
    '''
    Assessing trained model on test dataset 
    model: deep learning architecture getting updated by model
    loss_fn: loss function
    test_loader: generator creating batches of test data
    '''
    loss_test = 0.0
    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():  # does not keep track of gradients so as to not train on test data.
        for compounds, labels in tqdm_notebook(test_loader):
            # Move to device MAY NOT BE NECESSARY
            model = model.to(device)
            compounds = compounds.to(device = device)
            labels = labels.to(device= device)
            # Assessing outputs
            outputs = model(compounds)
            # print(f' Outputs : {outputs}') # tensor with 10 elements
            # print(f' Labels : {labels}') # tensor that is a number
            loss = loss_fn(outputs,labels)
            loss_test += loss.item()
            predicted = torch.argmax(outputs, 1)
            #labels = torch.argmax(labels,1)
            #print(predicted)
            #print(labels)
            total += labels.shape[0]
            correct += int((predicted == labels).sum())
        avg_test_loss = loss_test/len(test_loader)  # average loss over batch
    return correct, total, avg_test_loss

In [496]:
train_loss_per_epoch, train_acc_per_epoch, val_loss_per_epoch, val_acc_per_epoch = training_loop(n_epochs = max_epochs,
              optimizer = optimizer,
              model = seq_model,
              loss_fn = loss_function,
              train_loader=training_generator, 
              valid_loader=validation_generator)

Epoch:   0%|          | 1/250 [00:00<01:51,  2.24it/s]

 2023-02-06 16:47:32.816562 Epoch: 1, Training loss: 0.7589018940925598, Validation Loss: 0.7469367980957031 


Epoch:   1%|          | 2/250 [00:00<02:02,  2.02it/s]

 2023-02-06 16:47:33.346247 Epoch: 2, Training loss: 0.7542061507701874, Validation Loss: 0.7446339726448059 


Epoch:   2%|▏         | 4/250 [00:01<02:01,  2.02it/s]

 2023-02-06 16:47:34.341124 Epoch: 4, Training loss: 0.7548678517341614, Validation Loss: 0.7401986122131348 


Epoch:   2%|▏         | 6/250 [00:03<02:09,  1.88it/s]

 2023-02-06 16:47:35.456035 Epoch: 6, Training loss: 0.7468858957290649, Validation Loss: 0.7359644770622253 


Epoch:   3%|▎         | 8/250 [00:04<02:02,  1.97it/s]

 2023-02-06 16:47:36.425101 Epoch: 8, Training loss: 0.7347349524497986, Validation Loss: 0.7318773865699768 


Epoch:   4%|▍         | 10/250 [00:05<02:00,  1.98it/s]

 2023-02-06 16:47:37.427608 Epoch: 10, Training loss: 0.7235487401485443, Validation Loss: 0.7279866337776184 


Epoch:   5%|▍         | 12/250 [00:06<01:58,  2.01it/s]

 2023-02-06 16:47:38.412340 Epoch: 12, Training loss: 0.7285543382167816, Validation Loss: 0.7241320610046387 


Epoch:   6%|▌         | 14/250 [00:07<02:03,  1.91it/s]

 2023-02-06 16:47:39.498952 Epoch: 14, Training loss: 0.7282642126083374, Validation Loss: 0.7203192114830017 


Epoch:   6%|▋         | 16/250 [00:08<02:02,  1.91it/s]

 2023-02-06 16:47:40.542270 Epoch: 16, Training loss: 0.7187803983688354, Validation Loss: 0.7163724899291992 


Epoch:   7%|▋         | 18/250 [00:09<01:57,  1.98it/s]

 2023-02-06 16:47:41.514494 Epoch: 18, Training loss: 0.7030719518661499, Validation Loss: 0.7121397852897644 


Epoch:   8%|▊         | 20/250 [00:10<01:56,  1.97it/s]

 2023-02-06 16:47:42.537466 Epoch: 20, Training loss: 0.7131055295467377, Validation Loss: 0.7077369093894958 


Epoch:   9%|▉         | 22/250 [00:11<01:53,  2.01it/s]

 2023-02-06 16:47:43.513139 Epoch: 22, Training loss: 0.6984739005565643, Validation Loss: 0.7031105756759644 


Epoch:  10%|▉         | 24/250 [00:12<01:55,  1.96it/s]

 2023-02-06 16:47:44.555374 Epoch: 24, Training loss: 0.6909539699554443, Validation Loss: 0.6981858611106873 


Epoch:  10%|█         | 26/250 [00:13<01:52,  1.99it/s]

 2023-02-06 16:47:45.538084 Epoch: 26, Training loss: 0.6875430345535278, Validation Loss: 0.6927515864372253 


Epoch:  11%|█         | 28/250 [00:14<01:50,  2.01it/s]

 2023-02-06 16:47:46.523997 Epoch: 28, Training loss: 0.6815510094165802, Validation Loss: 0.686891496181488 


Epoch:  12%|█▏        | 30/250 [00:15<01:51,  1.98it/s]

 2023-02-06 16:47:47.549833 Epoch: 30, Training loss: 0.6753527820110321, Validation Loss: 0.6805431246757507 


Epoch:  13%|█▎        | 32/250 [00:16<01:47,  2.02it/s]

 2023-02-06 16:47:48.517476 Epoch: 32, Training loss: 0.6668086349964142, Validation Loss: 0.6733841300010681 


Epoch:  14%|█▎        | 34/250 [00:17<01:54,  1.89it/s]

 2023-02-06 16:47:49.620700 Epoch: 34, Training loss: 0.6635431051254272, Validation Loss: 0.6655620336532593 


Epoch:  14%|█▍        | 36/250 [00:18<01:49,  1.95it/s]

 2023-02-06 16:47:50.628158 Epoch: 36, Training loss: 0.6520528197288513, Validation Loss: 0.6571143269538879 


Epoch:  15%|█▌        | 38/250 [00:19<01:47,  1.97it/s]

 2023-02-06 16:47:51.638618 Epoch: 38, Training loss: 0.6383660435676575, Validation Loss: 0.6478897929191589 


Epoch:  16%|█▌        | 40/250 [00:20<01:45,  2.00it/s]

 2023-02-06 16:47:52.622790 Epoch: 40, Training loss: 0.6308471858501434, Validation Loss: 0.6379764080047607 


Epoch:  17%|█▋        | 42/250 [00:21<01:49,  1.89it/s]

 2023-02-06 16:47:53.721098 Epoch: 42, Training loss: 0.6135876476764679, Validation Loss: 0.6272563338279724 


Epoch:  18%|█▊        | 44/250 [00:22<01:46,  1.93it/s]

 2023-02-06 16:47:54.737945 Epoch: 44, Training loss: 0.6106352508068085, Validation Loss: 0.6160356402397156 


Epoch:  18%|█▊        | 46/250 [00:23<01:43,  1.96it/s]

 2023-02-06 16:47:55.743649 Epoch: 46, Training loss: 0.588046133518219, Validation Loss: 0.6037323474884033 


Epoch:  19%|█▉        | 48/250 [00:24<01:40,  2.01it/s]

 2023-02-06 16:47:56.719798 Epoch: 48, Training loss: 0.5905562043190002, Validation Loss: 0.5904428362846375 


Epoch:  20%|██        | 50/250 [00:25<01:39,  2.01it/s]

 2023-02-06 16:47:57.731422 Epoch: 50, Training loss: 0.5616617798805237, Validation Loss: 0.5760648250579834 


Epoch:  21%|██        | 52/250 [00:26<01:38,  2.02it/s]

 2023-02-06 16:47:58.720357 Epoch: 52, Training loss: 0.5366195142269135, Validation Loss: 0.5604910850524902 


Epoch:  22%|██▏       | 54/250 [00:27<01:37,  2.01it/s]

 2023-02-06 16:47:59.732331 Epoch: 54, Training loss: 0.5118328332901001, Validation Loss: 0.544067919254303 


Epoch:  22%|██▏       | 56/250 [00:28<01:37,  1.99it/s]

 2023-02-06 16:48:00.737431 Epoch: 56, Training loss: 0.4768238961696625, Validation Loss: 0.5268494486808777 


Epoch:  23%|██▎       | 58/250 [00:29<01:36,  2.00it/s]

 2023-02-06 16:48:01.730195 Epoch: 58, Training loss: 0.49171726405620575, Validation Loss: 0.5086962580680847 


Epoch:  24%|██▍       | 60/250 [00:30<01:40,  1.90it/s]

 2023-02-06 16:48:02.818062 Epoch: 60, Training loss: 0.47412264347076416, Validation Loss: 0.4903288781642914 


Epoch:  25%|██▍       | 62/250 [00:31<01:36,  1.94it/s]

 2023-02-06 16:48:03.823279 Epoch: 62, Training loss: 0.4443923383951187, Validation Loss: 0.4718162715435028 


Epoch:  26%|██▌       | 64/250 [00:32<01:34,  1.97it/s]

 2023-02-06 16:48:04.833485 Epoch: 64, Training loss: 0.42219896614551544, Validation Loss: 0.4531141221523285 


Epoch:  26%|██▋       | 66/250 [00:33<01:32,  1.98it/s]

 2023-02-06 16:48:05.833000 Epoch: 66, Training loss: 0.39767295122146606, Validation Loss: 0.43444785475730896 


Epoch:  27%|██▋       | 68/250 [00:34<01:31,  1.99it/s]

 2023-02-06 16:48:06.828405 Epoch: 68, Training loss: 0.3959670662879944, Validation Loss: 0.4160040318965912 


Epoch:  28%|██▊       | 70/250 [00:35<01:34,  1.90it/s]

 2023-02-06 16:48:07.911892 Epoch: 70, Training loss: 0.3594890236854553, Validation Loss: 0.398367702960968 


Epoch:  29%|██▉       | 72/250 [00:36<01:32,  1.92it/s]

 2023-02-06 16:48:08.939977 Epoch: 72, Training loss: 0.33937154710292816, Validation Loss: 0.38079240918159485 


Epoch:  30%|██▉       | 74/250 [00:37<01:32,  1.91it/s]

 2023-02-06 16:48:09.976276 Epoch: 74, Training loss: 0.3325663059949875, Validation Loss: 0.3635249435901642 


Epoch:  30%|███       | 76/250 [00:38<01:31,  1.90it/s]

 2023-02-06 16:48:11.013998 Epoch: 76, Training loss: 0.3062281608581543, Validation Loss: 0.3468628227710724 


Epoch:  31%|███       | 78/250 [00:39<01:28,  1.93it/s]

 2023-02-06 16:48:12.031628 Epoch: 78, Training loss: 0.3389716148376465, Validation Loss: 0.3307937979698181 


Epoch:  32%|███▏      | 80/250 [00:40<01:26,  1.97it/s]

 2023-02-06 16:48:13.027675 Epoch: 80, Training loss: 0.27835607528686523, Validation Loss: 0.3155674934387207 


Epoch:  33%|███▎      | 82/250 [00:41<01:24,  1.98it/s]

 2023-02-06 16:48:14.027463 Epoch: 82, Training loss: 0.26689913868904114, Validation Loss: 0.30073627829551697 


Epoch:  34%|███▎      | 84/250 [00:42<01:27,  1.90it/s]

 2023-02-06 16:48:15.153028 Epoch: 84, Training loss: 0.2133963257074356, Validation Loss: 0.2867397367954254 


Epoch:  34%|███▍      | 86/250 [00:43<01:22,  1.99it/s]

 2023-02-06 16:48:16.118629 Epoch: 86, Training loss: 0.23537135869264603, Validation Loss: 0.2736056447029114 


Epoch:  35%|███▌      | 88/250 [00:44<01:23,  1.95it/s]

 2023-02-06 16:48:17.164909 Epoch: 88, Training loss: 0.21292094886302948, Validation Loss: 0.26095736026763916 


Epoch:  36%|███▌      | 90/250 [00:45<01:19,  2.02it/s]

 2023-02-06 16:48:18.132195 Epoch: 90, Training loss: 0.2162911221385002, Validation Loss: 0.24904726445674896 


Epoch:  37%|███▋      | 92/250 [00:46<01:18,  2.02it/s]

 2023-02-06 16:48:19.119298 Epoch: 92, Training loss: 0.19760563969612122, Validation Loss: 0.2376074641942978 


Epoch:  38%|███▊      | 94/250 [00:47<01:17,  2.01it/s]

 2023-02-06 16:48:20.120158 Epoch: 94, Training loss: 0.19918788969516754, Validation Loss: 0.2265026718378067 


Epoch:  38%|███▊      | 96/250 [00:48<01:15,  2.03it/s]

 2023-02-06 16:48:21.106614 Epoch: 96, Training loss: 0.18060405552387238, Validation Loss: 0.21587756276130676 


Epoch:  39%|███▉      | 98/250 [00:49<01:17,  1.97it/s]

 2023-02-06 16:48:22.151611 Epoch: 98, Training loss: 0.1572396382689476, Validation Loss: 0.2058175653219223 


Epoch:  40%|████      | 100/250 [00:50<01:15,  1.98it/s]

 2023-02-06 16:48:23.154003 Epoch: 100, Training loss: 0.15949566662311554, Validation Loss: 0.19621774554252625 


Epoch:  41%|████      | 102/250 [00:51<01:13,  2.01it/s]

 2023-02-06 16:48:24.128527 Epoch: 102, Training loss: 0.1444564387202263, Validation Loss: 0.18708878755569458 


Epoch:  42%|████▏     | 104/250 [00:52<01:16,  1.92it/s]

 2023-02-06 16:48:25.221227 Epoch: 104, Training loss: 0.12298958748579025, Validation Loss: 0.17862901091575623 


Epoch:  42%|████▏     | 106/250 [00:53<01:14,  1.94it/s]

 2023-02-06 16:48:26.244504 Epoch: 106, Training loss: 0.11996760964393616, Validation Loss: 0.1702764332294464 


Epoch:  43%|████▎     | 108/250 [00:54<01:11,  1.99it/s]

 2023-02-06 16:48:27.226100 Epoch: 108, Training loss: 0.1378869004547596, Validation Loss: 0.16237372159957886 


Epoch:  44%|████▍     | 110/250 [00:55<01:11,  1.97it/s]

 2023-02-06 16:48:28.249395 Epoch: 110, Training loss: 0.11343086510896683, Validation Loss: 0.1547577977180481 


Epoch:  45%|████▍     | 112/250 [00:56<01:08,  2.01it/s]

 2023-02-06 16:48:29.228688 Epoch: 112, Training loss: 0.1182335652410984, Validation Loss: 0.14743995666503906 


Epoch:  46%|████▌     | 114/250 [00:57<01:12,  1.88it/s]

 2023-02-06 16:48:30.347847 Epoch: 114, Training loss: 0.0966435931622982, Validation Loss: 0.14054752886295319 


Epoch:  46%|████▋     | 116/250 [00:59<01:11,  1.88it/s]

 2023-02-06 16:48:31.417993 Epoch: 116, Training loss: 0.10260643437504768, Validation Loss: 0.13407766819000244 


Epoch:  47%|████▋     | 118/250 [01:00<01:12,  1.82it/s]

 2023-02-06 16:48:32.541532 Epoch: 118, Training loss: 0.09055427089333534, Validation Loss: 0.12785398960113525 


Epoch:  48%|████▊     | 120/250 [01:01<01:08,  1.90it/s]

 2023-02-06 16:48:33.553308 Epoch: 120, Training loss: 0.09534693136811256, Validation Loss: 0.12198004126548767 


Epoch:  49%|████▉     | 122/250 [01:02<01:04,  1.98it/s]

 2023-02-06 16:48:34.529951 Epoch: 122, Training loss: 0.08613143861293793, Validation Loss: 0.11645731329917908 


Epoch:  50%|████▉     | 124/250 [01:03<01:07,  1.88it/s]

 2023-02-06 16:48:35.632305 Epoch: 124, Training loss: 0.06582648493349552, Validation Loss: 0.11129452288150787 


Epoch:  50%|█████     | 126/250 [01:04<01:03,  1.95it/s]

 2023-02-06 16:48:36.617855 Epoch: 126, Training loss: 0.06696582958102226, Validation Loss: 0.10668976604938507 


Epoch:  51%|█████     | 128/250 [01:05<01:01,  1.97it/s]

 2023-02-06 16:48:37.628629 Epoch: 128, Training loss: 0.06255333498120308, Validation Loss: 0.10228854417800903 


Epoch:  52%|█████▏    | 130/250 [01:06<01:02,  1.91it/s]

 2023-02-06 16:48:38.706804 Epoch: 130, Training loss: 0.061941975727677345, Validation Loss: 0.0980934277176857 


Epoch:  53%|█████▎    | 132/250 [01:07<01:00,  1.94it/s]

 2023-02-06 16:48:39.723382 Epoch: 132, Training loss: 0.058481913059949875, Validation Loss: 0.09407994896173477 


Epoch:  54%|█████▎    | 134/250 [01:08<00:59,  1.96it/s]

 2023-02-06 16:48:40.744147 Epoch: 134, Training loss: 0.05584428645670414, Validation Loss: 0.09028683602809906 


Epoch:  54%|█████▍    | 136/250 [01:09<00:57,  1.99it/s]

 2023-02-06 16:48:41.727776 Epoch: 136, Training loss: 0.0500300619751215, Validation Loss: 0.08680219203233719 


Epoch:  55%|█████▌    | 138/250 [01:10<00:59,  1.89it/s]

 2023-02-06 16:48:42.821970 Epoch: 138, Training loss: 0.056514350697398186, Validation Loss: 0.08354607969522476 


Epoch:  56%|█████▌    | 140/250 [01:11<00:56,  1.94it/s]

 2023-02-06 16:48:43.828793 Epoch: 140, Training loss: 0.06381128914654255, Validation Loss: 0.08026836067438126 


Epoch:  57%|█████▋    | 142/250 [01:12<00:57,  1.89it/s]

 2023-02-06 16:48:44.898629 Epoch: 142, Training loss: 0.03655182849615812, Validation Loss: 0.07723657041788101 


Epoch:  58%|█████▊    | 144/250 [01:13<00:56,  1.87it/s]

 2023-02-06 16:48:45.997045 Epoch: 144, Training loss: 0.041166823357343674, Validation Loss: 0.0743926540017128 


Epoch:  58%|█████▊    | 146/250 [01:14<00:52,  1.97it/s]

 2023-02-06 16:48:46.944547 Epoch: 146, Training loss: 0.043942222371697426, Validation Loss: 0.07167702913284302 


Epoch:  59%|█████▉    | 148/250 [01:15<00:53,  1.91it/s]

 2023-02-06 16:48:48.024488 Epoch: 148, Training loss: 0.04425273463129997, Validation Loss: 0.06908435374498367 


Epoch:  60%|██████    | 150/250 [01:16<00:51,  1.95it/s]

 2023-02-06 16:48:49.032770 Epoch: 150, Training loss: 0.03759705927222967, Validation Loss: 0.06663280725479126 


Epoch:  61%|██████    | 152/250 [01:17<00:49,  1.99it/s]

 2023-02-06 16:48:50.019740 Epoch: 152, Training loss: 0.03213907405734062, Validation Loss: 0.06432150304317474 


Epoch:  62%|██████▏   | 154/250 [01:18<00:48,  1.99it/s]

 2023-02-06 16:48:51.028410 Epoch: 154, Training loss: 0.04473158158361912, Validation Loss: 0.06209945306181908 


Epoch:  62%|██████▏   | 156/250 [01:19<00:47,  1.97it/s]

 2023-02-06 16:48:52.044787 Epoch: 156, Training loss: 0.038584670051932335, Validation Loss: 0.05994141101837158 


Epoch:  63%|██████▎   | 158/250 [01:20<00:46,  1.96it/s]

 2023-02-06 16:48:53.065058 Epoch: 158, Training loss: 0.034233883023262024, Validation Loss: 0.05789573863148689 


Epoch:  64%|██████▍   | 160/250 [01:21<00:46,  1.92it/s]

 2023-02-06 16:48:54.113081 Epoch: 160, Training loss: 0.025491340085864067, Validation Loss: 0.055970095098018646 


Epoch:  65%|██████▍   | 162/250 [01:22<00:45,  1.94it/s]

 2023-02-06 16:48:55.143569 Epoch: 162, Training loss: 0.024861506186425686, Validation Loss: 0.05421038717031479 


Epoch:  66%|██████▌   | 164/250 [01:23<00:43,  1.98it/s]

 2023-02-06 16:48:56.152471 Epoch: 164, Training loss: 0.026232723146677017, Validation Loss: 0.0524667352437973 


Epoch:  66%|██████▋   | 166/250 [01:24<00:41,  2.01it/s]

 2023-02-06 16:48:57.130941 Epoch: 166, Training loss: 0.025483685545623302, Validation Loss: 0.05082010477781296 


Epoch:  67%|██████▋   | 168/250 [01:25<00:42,  1.92it/s]

 2023-02-06 16:48:58.229272 Epoch: 168, Training loss: 0.030919436365365982, Validation Loss: 0.0492519848048687 


Epoch:  68%|██████▊   | 170/250 [01:26<00:43,  1.85it/s]

 2023-02-06 16:48:59.328458 Epoch: 170, Training loss: 0.022003312595188618, Validation Loss: 0.047780733555555344 


Epoch:  69%|██████▉   | 172/250 [01:27<00:40,  1.91it/s]

 2023-02-06 16:49:00.342568 Epoch: 172, Training loss: 0.02310791052877903, Validation Loss: 0.0464162603020668 


Epoch:  70%|██████▉   | 174/250 [01:29<00:40,  1.89it/s]

 2023-02-06 16:49:01.402057 Epoch: 174, Training loss: 0.023309774696826935, Validation Loss: 0.04517452418804169 


Epoch:  70%|███████   | 176/250 [01:29<00:36,  2.01it/s]

 2023-02-06 16:49:02.335420 Epoch: 176, Training loss: 0.02400297112762928, Validation Loss: 0.0440015085041523 


Epoch:  71%|███████   | 178/250 [01:31<00:36,  1.96it/s]

 2023-02-06 16:49:03.382561 Epoch: 178, Training loss: 0.026934927329421043, Validation Loss: 0.042837534099817276 


Epoch:  72%|███████▏  | 180/250 [01:31<00:35,  2.00it/s]

 2023-02-06 16:49:04.354590 Epoch: 180, Training loss: 0.023598063737154007, Validation Loss: 0.04173453524708748 


Epoch:  73%|███████▎  | 182/250 [01:32<00:33,  2.00it/s]

 2023-02-06 16:49:05.343790 Epoch: 182, Training loss: 0.023915368132293224, Validation Loss: 0.040675900876522064 


Epoch:  74%|███████▎  | 184/250 [01:33<00:33,  2.00it/s]

 2023-02-06 16:49:06.343032 Epoch: 184, Training loss: 0.01598828937858343, Validation Loss: 0.03966556861996651 


Epoch:  74%|███████▍  | 186/250 [01:35<00:33,  1.89it/s]

 2023-02-06 16:49:07.446063 Epoch: 186, Training loss: 0.018683391623198986, Validation Loss: 0.038704242557287216 


Epoch:  75%|███████▌  | 188/250 [01:36<00:32,  1.89it/s]

 2023-02-06 16:49:08.502507 Epoch: 188, Training loss: 0.01784104062244296, Validation Loss: 0.0377470999956131 


Epoch:  76%|███████▌  | 190/250 [01:37<00:31,  1.92it/s]

 2023-02-06 16:49:09.531556 Epoch: 190, Training loss: 0.02121424302458763, Validation Loss: 0.03679903224110603 


Epoch:  77%|███████▋  | 192/250 [01:38<00:30,  1.88it/s]

 2023-02-06 16:49:10.612090 Epoch: 192, Training loss: 0.015683818142861128, Validation Loss: 0.035883814096450806 


Epoch:  78%|███████▊  | 194/250 [01:39<00:30,  1.85it/s]

 2023-02-06 16:49:11.703908 Epoch: 194, Training loss: 0.024875455535948277, Validation Loss: 0.03505605831742287 


Epoch:  78%|███████▊  | 196/250 [01:40<00:28,  1.87it/s]

 2023-02-06 16:49:12.756183 Epoch: 196, Training loss: 0.013159333728253841, Validation Loss: 0.03425375744700432 


Epoch:  79%|███████▉  | 198/250 [01:41<00:26,  1.97it/s]

 2023-02-06 16:49:13.729094 Epoch: 198, Training loss: 0.01384609891101718, Validation Loss: 0.03347990661859512 


Epoch:  80%|████████  | 200/250 [01:42<00:25,  1.97it/s]

 2023-02-06 16:49:14.737697 Epoch: 200, Training loss: 0.016837109811604023, Validation Loss: 0.032724231481552124 


Epoch:  81%|████████  | 202/250 [01:43<00:24,  1.98it/s]

 2023-02-06 16:49:15.744808 Epoch: 202, Training loss: 0.014625389128923416, Validation Loss: 0.03197580575942993 


Epoch:  82%|████████▏ | 204/250 [01:44<00:23,  2.00it/s]

 2023-02-06 16:49:16.741813 Epoch: 204, Training loss: 0.023424511309713125, Validation Loss: 0.03126185014843941 


Epoch:  82%|████████▏ | 206/250 [01:45<00:22,  1.93it/s]

 2023-02-06 16:49:17.828753 Epoch: 206, Training loss: 0.01643827836960554, Validation Loss: 0.030563360080122948 


Epoch:  83%|████████▎ | 208/250 [01:46<00:21,  1.94it/s]

 2023-02-06 16:49:18.847856 Epoch: 208, Training loss: 0.015180379152297974, Validation Loss: 0.02988767810165882 


Epoch:  84%|████████▍ | 210/250 [01:47<00:20,  1.97it/s]

 2023-02-06 16:49:19.848329 Epoch: 210, Training loss: 0.008660978637635708, Validation Loss: 0.02925516851246357 


Epoch:  85%|████████▍ | 212/250 [01:48<00:18,  2.02it/s]

 2023-02-06 16:49:20.815912 Epoch: 212, Training loss: 0.01789429923519492, Validation Loss: 0.028638824820518494 


Epoch:  86%|████████▌ | 214/250 [01:49<00:20,  1.77it/s]

 2023-02-06 16:49:22.038319 Epoch: 214, Training loss: 0.016089586541056633, Validation Loss: 0.02804049290716648 


Epoch:  86%|████████▋ | 216/250 [01:50<00:18,  1.89it/s]

 2023-02-06 16:49:23.032362 Epoch: 216, Training loss: 0.015451650135219097, Validation Loss: 0.027452055364847183 


Epoch:  87%|████████▋ | 218/250 [01:51<00:16,  1.94it/s]

 2023-02-06 16:49:24.030183 Epoch: 218, Training loss: 0.012382915709167719, Validation Loss: 0.026882819831371307 


Epoch:  88%|████████▊ | 220/250 [01:52<00:15,  1.88it/s]

 2023-02-06 16:49:25.115874 Epoch: 220, Training loss: 0.010557342320680618, Validation Loss: 0.02633460983633995 


Epoch:  89%|████████▉ | 222/250 [01:53<00:14,  1.94it/s]

 2023-02-06 16:49:26.119860 Epoch: 222, Training loss: 0.011401607654988766, Validation Loss: 0.025834770873188972 


Epoch:  90%|████████▉ | 224/250 [01:54<00:13,  1.95it/s]

 2023-02-06 16:49:27.136828 Epoch: 224, Training loss: 0.012445876840502024, Validation Loss: 0.025341033935546875 


Epoch:  90%|█████████ | 226/250 [01:55<00:12,  1.89it/s]

 2023-02-06 16:49:28.233721 Epoch: 226, Training loss: 0.010714303236454725, Validation Loss: 0.0248603206127882 


Epoch:  91%|█████████ | 228/250 [01:56<00:11,  1.94it/s]

 2023-02-06 16:49:29.233821 Epoch: 228, Training loss: 0.009831741219386458, Validation Loss: 0.024393565952777863 


Epoch:  92%|█████████▏| 230/250 [01:57<00:10,  1.98it/s]

 2023-02-06 16:49:30.231422 Epoch: 230, Training loss: 0.010521480813622475, Validation Loss: 0.023950347676873207 


Epoch:  93%|█████████▎| 232/250 [01:58<00:09,  1.99it/s]

 2023-02-06 16:49:31.238623 Epoch: 232, Training loss: 0.007581221172586083, Validation Loss: 0.023535508662462234 


Epoch:  94%|█████████▎| 234/250 [01:59<00:08,  1.91it/s]

 2023-02-06 16:49:32.312755 Epoch: 234, Training loss: 0.010327600874006748, Validation Loss: 0.023153260350227356 


Epoch:  94%|█████████▍| 236/250 [02:00<00:07,  1.90it/s]

 2023-02-06 16:49:33.363341 Epoch: 236, Training loss: 0.013736754190176725, Validation Loss: 0.022777969017624855 


Epoch:  95%|█████████▌| 238/250 [02:01<00:05,  2.00it/s]

 2023-02-06 16:49:34.315304 Epoch: 238, Training loss: 0.012321268673986197, Validation Loss: 0.022413143888115883 


Epoch:  96%|█████████▌| 240/250 [02:02<00:05,  1.99it/s]

 2023-02-06 16:49:35.336831 Epoch: 240, Training loss: 0.009346939157694578, Validation Loss: 0.022054487839341164 


Epoch:  97%|█████████▋| 242/250 [02:04<00:04,  1.87it/s]

 2023-02-06 16:49:36.444759 Epoch: 242, Training loss: 0.009967150166630745, Validation Loss: 0.021695680916309357 


Epoch:  98%|█████████▊| 244/250 [02:05<00:03,  1.96it/s]

 2023-02-06 16:49:37.420197 Epoch: 244, Training loss: 0.008165040984749794, Validation Loss: 0.021351510658860207 


Epoch:  98%|█████████▊| 246/250 [02:06<00:02,  1.93it/s]

 2023-02-06 16:49:38.467138 Epoch: 246, Training loss: 0.011859591584652662, Validation Loss: 0.020989414304494858 


Epoch:  99%|█████████▉| 248/250 [02:07<00:01,  1.99it/s]

 2023-02-06 16:49:39.441997 Epoch: 248, Training loss: 0.006795366061851382, Validation Loss: 0.020632246509194374 


Epoch: 100%|██████████| 250/250 [02:08<00:00,  1.95it/s]

 2023-02-06 16:49:40.508095 Epoch: 250, Training loss: 0.006349924253299832, Validation Loss: 0.02028319425880909 





In [497]:
#print(train_loss_per_epoch, train_acc_per_epoch, val_loss_per_epoch, val_acc_per_epoch)

In [498]:
def val_vs_train_loss(epochs, train_loss, val_loss):
    ''' 
    Plotting validation versus training loss over time
    epochs: number of epochs that the model ran (int. hyperparameter)
    train_loss: training loss per epoch (python list)
    val_loss: validation loss per epoch (python list)
    ''' 
    loss_path_to_save = '/home/jovyan/Tomics-CP-Chem-MoA/01_CStructure_Models/saved_images/pre_split'
    plt.figure()
    x_axis = list(range(1, epochs +1)) # create x axis with number of
    plt.plot(x_axis, train_loss, label = "train_loss")
    plt.plot(x_axis, val_loss, label = "val_loss")
    # Figure description
    plt.xlabel('# of Epochs')
    plt.ylabel('Loss')
    plt.title('Validation versus Training Loss: CP Image Model')
    plt.legend()
    # plot
    plt.savefig(loss_path_to_save + '/' + 'loss_train_val_' + now)


In [499]:
def val_vs_train_accuracy(epochs, train_acc, val_acc):
    '''
    Plotting validation versus training loss over time
    epochs: number of epochs that the model ran (int. hyperparameter)
    train_acc: accuracy loss per epoch (python list)
    val_acc: accuracy loss per epoch (python list)
    '''
    acc_path_to_save = '/home/jovyan/Tomics-CP-Chem-MoA/01_CStructure_Models/saved_images/pre_split'
    plt.figure()
    x_axis = list(range(1, epochs +1)) # create x axis with number of
    plt.plot(x_axis, train_acc, label = "train_acc")
    plt.plot(x_axis, val_acc, label = "val_acc")
    # Figure description
    plt.xlabel('# of Epochs')
    plt.ylabel('Accuracy')
    plt.title('Validation versus Training Accuracy: CP Image Model')
    plt.legend()
    # plot
    plt.savefig(acc_path_to_save + '/' + 'acc_train_val_' + now)

In [500]:
val_vs_train_loss(max_epochs,train_loss_per_epoch, val_loss_per_epoch)

In [501]:
val_vs_train_accuracy(max_epochs, train_acc_per_epoch, val_acc_per_epoch)

In [502]:
correct, total, avg_test_loss = test_loop(model = seq_model,
                                          loss_fn = loss_function, 
                                          test_loader = test_generator)

  0%|          | 0/1 [00:00<?, ?it/s]

In [503]:
correct/total
# create PyTorch architecture

0.9285714285714286