In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import torchvision.transforms as transforms
from torch.utils.data.sampler import SubsetRandomSampler
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pickle
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [2]:
# Detect whether a CUDA device can be used and report where training will run.
train_on_gpu = torch.cuda.is_available()
print('CUDA is available!  Training on GPU ...' if train_on_gpu
      else 'CUDA is not available.  Training on CPU ...')

# Reproducibility: seed NumPy and PyTorch, and force cuDNN to use
# deterministic kernels with auto-tuning (benchmark mode) switched off.
np.random.seed(0)
torch.manual_seed(0)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

CUDA is available!  Training on GPU ...


## 1. Dataloader
Source: PyTorch data loading tutorial, https://pytorch.org/tutorials/beginner/data_loading_tutorial.html

In [3]:
class CustomImageDataset(Dataset):
    """Dataset of pickled multi-channel images described by a CSV file.

    Every CSV row must provide the columns ``molecule_name``, ``id`` and
    ``scalar_coupling_constant``. The image belonging to a row is read from
    ``<root_dir>/<molecule_name>_<id>.pkl``.

    Args:
        csv_file: Name of the description CSV, relative to ``root_dir``.
        root_dir: Directory holding the CSV and the ``.pkl`` image files;
            a trailing path separator is optional.
        transform: Optional callable applied to the image tensor.
    """

    def __init__(self, csv_file, root_dir, transform=None):
        # os.path.join (rather than string concatenation) so that root_dir
        # works with or without a trailing separator.
        self.df = pd.read_csv(os.path.join(root_dir, csv_file))
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the description CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, target)`` for the row at position ``idx``.

        ``image`` is a float32 tensor with values clipped to [0, 255] and
        scaled to [0, 1]; ``target`` is the row's
        ``scalar_coupling_constant``.
        """
        # Look the row up once instead of once per column.
        row = self.df.iloc[idx]

        img_name = os.path.join(
            self.root_dir,
            str(row['molecule_name']) + '_' + str(row['id']) + '.pkl'
        )

        # SECURITY: pickle.load can execute arbitrary code; only point this
        # dataset at image files from a trusted source.
        with open(img_name, 'rb') as fp:
            image = pickle.load(fp)

        # Normalise every channel in one vectorised step. Converting to float
        # first avoids truncation when the pickle holds an integer ndarray
        # (in-place assignment would cast the scaled values back to ints),
        # and removes the hard-coded assumption of exactly five channels.
        pixels = np.clip(np.asarray(image, dtype=np.float64), 0, 255) / 255
        img = torch.from_numpy(pixels).float()

        if self.transform:
            img = self.transform(img)

        return img, row['scalar_coupling_constant']

In [4]:
# Directory holding description.csv and the pickled images. The trailing
# slash matters: other cells build paths by plain string concatenation.
images_path = "../Data/full-images-2jhn/Image_2JHN/"

image_dataset = CustomImageDataset(root_dir=images_path, csv_file='description.csv')

In [7]:
# Sanity check: shape of the image tensor for the sample at index 3.
sample_image = image_dataset[3][0]
sample_image.shape

torch.Size([5, 35, 35])

## 2. Cross validation

In [10]:
from sklearn.model_selection import GroupKFold

# Five-fold grouped splitter. It is only instantiated here; the fold
# generation at the bottom of this cell is currently disabled.
group_kfold = GroupKFold(n_splits=5)

# Load the sample descriptions and renumber the rows 0..n-1.
df = pd.read_csv(images_path + 'description.csv', index_col=0).reset_index(drop=True)

X = df[['id', 'molecule_name']].copy()
y = df['scalar_coupling_constant']
groups = df['molecule_name'].unique()
# NOTE(review): `groups` holds the unique molecule names, whereas the disabled
# split below passes the per-row X['molecule_name'] column -- confirm which
# one is intended before re-enabling grouped folds.

# folds = []
# for train_idx, valid_idx in group_kfold.split(X, y, X['molecule_name']):
#     folds.append([train_idx, valid_idx])

In [11]:
# Random ordering of all row positions; sliced later into train/validation.
ids = np.random.permutation(len(df))

In [13]:
# Data-loading settings
num_workers = 0    # subprocesses used for data loading
batch_size = 512   # samples per batch

# Per-channel normalisation for 5-channel image tensors.
# Defined here but not passed to the dataset below.
transform = transforms.Compose([
    transforms.Normalize(mean=(0.5,) * 5, std=(0.5,) * 5),
])

train_data = CustomImageDataset(csv_file='description.csv', root_dir=images_path)

# Simple hold-out split: the last 8000 shuffled indices are used for
# validation, everything before them for training.
# (The fold-based alternative was: folds[index_fold][0], folds[index_fold][1].)
train_idx = ids[:-8000]
valid_idx = ids[-8000:]

# Samplers draw batches from the respective index subsets.
train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# Both loaders read from the same dataset object; only the sampler differs.
train_loader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
    num_workers=num_workers,
)
valid_loader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=valid_sampler,
    num_workers=num_workers,
)

In [14]:
class CNN(nn.Module):
    """Convolutional regressor that maps an image to a single scalar.

    Six unpadded 5x5 convolutions (each followed by batch norm and ReLU) with
    32, 64, 128, 128, 256 and 256 output channels feed a dropout + linear
    head. Every convolution shrinks each spatial side by 4 pixels, so the
    default 35x35 input becomes 11x11 (256 * 11 * 11 = 30976 features).

    Args:
        in_channels: Number of channels of the input images (default 5).
        input_size: Height/width of the square input images (default 35).
            Used to size the linear head; must be at least 25.

    Raises:
        ValueError: If ``input_size`` is too small for the convolution stack.
    """

    _KERNEL_SIZE = 5
    _CONV_CHANNELS = (32, 64, 128, 128, 256, 256)

    def __init__(self, in_channels=5, input_size=35):
        """CNN Builder."""
        super(CNN, self).__init__()

        # Build the conv stack as a flat Sequential so the module indices
        # (and therefore state_dict keys) are conv_layer.0 ... conv_layer.17.
        layers = []
        previous = in_channels
        for width in self._CONV_CHANNELS:
            layers.append(nn.Conv2d(in_channels=previous, out_channels=width,
                                    kernel_size=self._KERNEL_SIZE))
            layers.append(nn.BatchNorm2d(width))
            layers.append(nn.ReLU(inplace=True))
            previous = width
        self.conv_layer = nn.Sequential(*layers)

        # Each unpadded, stride-1 convolution removes (kernel_size - 1)
        # pixels from every spatial side.
        feature_side = input_size - len(self._CONV_CHANNELS) * (self._KERNEL_SIZE - 1)
        if feature_side < 1:
            raise ValueError(
                'input_size={} is too small for {} unpadded {}x{} convolutions'.format(
                    input_size, len(self._CONV_CHANNELS),
                    self._KERNEL_SIZE, self._KERNEL_SIZE))

        self.fc_layer = nn.Sequential(
            nn.Dropout(p=0.5),
            nn.Linear(previous * feature_side * feature_side, 1),
        )

    def forward(self, x):
        """Return a ``(batch, 1)`` tensor of predictions for image batch ``x``."""
        x = self.conv_layer(x)
        # flatten (unlike view) also works when the conv output is not contiguous
        x = torch.flatten(x, 1)
        x = self.fc_layer(x)
        return x

In [17]:
# x = next(iter(train_loader))[0].to('cuda')

# model.conv_layer(x).shape

torch.Size([512, 256, 11, 11])

In [None]:
# Create the CNN and place it on the GPU when one is available; fall back to
# the CPU otherwise instead of crashing on a hard-coded 'cuda' device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CNN().to(device)

In [21]:
#import torch.optim as optim
from torch.nn.utils import clip_grad_value_
from functions_refactor import RAdam

# Loss function: smooth L1 (a regression loss, not cross-entropy).
criterion = nn.SmoothL1Loss()

# Optimizer: RAdam with weight decay (plain Adam alternative kept for reference).
#optimizer = optim.Adam(model.parameters(), lr=0.0001)
optimizer = RAdam(model.parameters(), lr=1e-4, weight_decay=1e-2)

# Halve the learning rate once the monitored loss has stopped decreasing
# for longer than the configured patience.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

# Gradient clipping value handed to train_cnn.
clip = 2

In [22]:
import copy
import time
import numpy as np
import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_value_


def train_cnn(model,optimizer,train_loader,valid_loader,n_epochs,clip,scheduler):
    """Train ``model`` with a smooth L1 loss and return it with its best weights.

    After every epoch the model is evaluated on ``valid_loader``; the weights
    of the epoch with the lowest validation loss are restored before
    returning.

    Args:
        model: Network to train; batches are moved to the device of its
            parameters.
        optimizer: Optimizer updating ``model``'s parameters.
        train_loader: Loader yielding ``(data, target)`` training batches.
        valid_loader: Loader yielding ``(data, target)`` validation batches.
        n_epochs: Number of epochs to run.
        clip: Gradients are clipped element-wise to ``[-clip, clip]``.
        scheduler: Scheduler stepped with the validation loss every epoch.

    Returns:
        The same ``model`` object, loaded with the best weights seen (or left
        unchanged when ``n_epochs`` is 0).
    """
    start_time = time.time()
    criterion = nn.SmoothL1Loss()

    # Follow the model's device instead of hard-coding .cuda(), so the loop
    # also runs on CPU-only machines.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # float('inf') rather than np.Inf, which was removed in NumPy 2.0.
    valid_loss_min = float('inf')
    # Start from the current weights so there is always something to restore,
    # even when no epoch runs or no epoch yields a finite validation loss.
    best_weights = copy.deepcopy(model.state_dict())

    for epoch in range(1, n_epochs+1):

        # keep track of training and validation loss
        train_loss = 0.0
        valid_loss = 0.0

        ###################
        # train the model #
        ###################
        model.train()
        for ind, (data, target) in enumerate(train_loader):
            print(ind, end='\r')

            data, target = data.to(device), target.to(device)

            optimizer.zero_grad()
            output = model(data)

            loss = criterion(output.view(data.shape[0]), target.float())
            loss.backward()
            clip_grad_value_(model.parameters(),clip)
            optimizer.step()

            # criterion returns the batch mean; weight it by the batch size
            train_loss += loss.item()*data.size(0)

        ######################
        # validate the model #
        ######################
        model.eval()
        with torch.no_grad():
            for data, target in valid_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                loss = criterion(output.view(data.shape[0]), target.float())
                valid_loss += loss.item() * data.size(0)

        # calculate average losses
        train_loss = train_loss / len(train_loader.sampler)
        valid_loss = valid_loss / len(valid_loader.sampler)

        # print training/validation statistics
        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
            epoch, train_loss, valid_loss))

        # Snapshot the weights only when the validation loss has improved.
        # The running minimum must be updated here, otherwise every epoch
        # would overwrite the snapshot and the last epoch would always win.
        if valid_loss <= valid_loss_min:
            valid_loss_min = valid_loss
            best_weights = copy.deepcopy(model.state_dict())

        scheduler.step(valid_loss)

    model.load_state_dict(best_weights)
    time_elapsed = time.time() - start_time
    print('Training completed in {}s'.format(time_elapsed))

    return model


In [24]:
# Short two-epoch training run with the loaders, optimizer and scheduler
# configured in the cells above.
model = train_cnn(
    model=model,
    optimizer=optimizer,
    train_loader=train_loader,
    valid_loader=valid_loader,
    n_epochs=2,
    clip=clip,
    scheduler=scheduler,
)

Epoch: 1 	Training Loss: 0.022865 	Validation Loss: 0.035242
Epoch: 2 	Training Loss: 0.026647 	Validation Loss: 0.037225
Training completed in 228.306001663208s


In [17]:
import copy
import time
import numpy as np
import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_value_


# NOTE(review): this cell defines train_cnn a second time (an earlier cell
# defines it as well); consider keeping a single definition in the notebook.
def train_cnn(model,optimizer,train_loader,valid_loader,n_epochs,clip,scheduler):
    """Train ``model`` with a smooth L1 loss and return it with its best weights.

    After every epoch the model is evaluated on ``valid_loader``; the weights
    of the epoch with the lowest validation loss are restored before
    returning.

    Args:
        model: Network to train; batches are moved to the device of its
            parameters.
        optimizer: Optimizer updating ``model``'s parameters.
        train_loader: Loader yielding ``(data, target)`` training batches.
        valid_loader: Loader yielding ``(data, target)`` validation batches.
        n_epochs: Number of epochs to run.
        clip: Gradients are clipped element-wise to ``[-clip, clip]``.
        scheduler: Scheduler stepped with the validation loss every epoch.

    Returns:
        The same ``model`` object, loaded with the best weights seen (or left
        unchanged when ``n_epochs`` is 0).
    """
    start_time = time.time()
    criterion = nn.SmoothL1Loss()

    # Follow the model's device instead of hard-coding .cuda(), so the loop
    # also runs on CPU-only machines.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # float('inf') rather than np.Inf, which was removed in NumPy 2.0.
    valid_loss_min = float('inf')
    # Start from the current weights so there is always something to restore,
    # even when no epoch runs or no epoch yields a finite validation loss.
    best_weights = copy.deepcopy(model.state_dict())

    for epoch in range(1, n_epochs+1):

        # keep track of training and validation loss
        train_loss = 0.0
        valid_loss = 0.0

        ###################
        # train the model #
        ###################
        model.train()
        for ind, (data, target) in enumerate(train_loader):
            print(ind, end='\r')

            data, target = data.to(device), target.to(device)

            optimizer.zero_grad()
            output = model(data)

            loss = criterion(output.view(data.shape[0]), target.float())
            loss.backward()
            clip_grad_value_(model.parameters(),clip)
            optimizer.step()

            # criterion returns the batch mean; weight it by the batch size
            train_loss += loss.item()*data.size(0)

        ######################
        # validate the model #
        ######################
        model.eval()
        with torch.no_grad():
            for data, target in valid_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                loss = criterion(output.view(data.shape[0]), target.float())
                valid_loss += loss.item() * data.size(0)

        # calculate average losses
        train_loss = train_loss / len(train_loader.sampler)
        valid_loss = valid_loss / len(valid_loader.sampler)

        # print training/validation statistics
        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
            epoch, train_loss, valid_loss))

        # Snapshot the weights only when the validation loss has improved.
        # The running minimum must be updated here, otherwise every epoch
        # would overwrite the snapshot and the last epoch would always win.
        if valid_loss <= valid_loss_min:
            valid_loss_min = valid_loss
            best_weights = copy.deepcopy(model.state_dict())

        scheduler.step(valid_loss)

    model.load_state_dict(best_weights)
    time_elapsed = time.time() - start_time
    print('Training completed in {}s'.format(time_elapsed))

    return model

Epoch: 1 	Training Loss: 1.031424 	Validation Loss: 0.345287
Epoch: 2 	Training Loss: 0.241303 	Validation Loss: 0.148097
Epoch: 3 	Training Loss: 0.169755 	Validation Loss: 0.112682
Epoch: 4 	Training Loss: 0.134530 	Validation Loss: 0.210637
Epoch: 5 	Training Loss: 0.114336 	Validation Loss: 0.087889
Epoch: 6 	Training Loss: 0.100729 	Validation Loss: 0.079452
Epoch: 7 	Training Loss: 0.087231 	Validation Loss: 0.081158
Epoch: 8 	Training Loss: 0.072704 	Validation Loss: 0.079030
Epoch: 9 	Training Loss: 0.066334 	Validation Loss: 0.062239
Epoch: 10 	Training Loss: 0.061051 	Validation Loss: 0.086579
Epoch: 11 	Training Loss: 0.058313 	Validation Loss: 0.059595
Epoch: 12 	Training Loss: 0.055323 	Validation Loss: 0.047605
Epoch: 13 	Training Loss: 0.046146 	Validation Loss: 0.048918
Epoch: 14 	Training Loss: 0.042480 	Validation Loss: 0.157026
Epoch: 15 	Training Loss: 0.044765 	Validation Loss: 0.052235
Epoch: 16 	Training Loss: 0.041266 	Validation Loss: 0.056339
Epoch: 17 	Traini

KeyboardInterrupt: 