## 1. https://github.com/joshchang1112/bert_gnn_arxiv

In [6]:

import torch
import torch.nn.functional as F
import torch_geometric.transforms as T
import transformers
import numpy as np
import pandas as pd
import csv
from torch_geometric.nn import GCNConv, SAGEConv, GATConv
from ogb.nodeproppred.dataset_pyg import PygNodePropPredDataset
from ogb.nodeproppred import Evaluator
#from tqdm.notebook import tqdm
import pickle
import os
import json
import torch
import numpy as np
import random
from tqdm import tqdm


# Report the installed PyTorch version and whether a CUDA device is usable.
print("Pytorch Version: ",  torch.__version__)
if not torch.cuda.is_available():
    print("Only CPU is available!")
else:
    print("GPU {} is available!".format(torch.cuda.current_device()))

Pytorch Version:  1.13.1
GPU 0 is available!


In [None]:
# Settings shared by the preprocessing, fine-tuning and feature-encoding cells.
config = {
    # Sub-folder name joined onto 'dataset/ogbn_arxiv' by the dataset-building main().
    "encoder": "bert",
    # Pickled id-mapping dictionaries written by the dataset-building main().
    "node2paper": "dataset/ogbn_arxiv/mapping/node2paper.pkl",
    "paper2node": "dataset/ogbn_arxiv/mapping/paper2node.pkl",
    # Tab-separated raw text read with pandas.
    "raw_text_path": "dataset/ogbn_arxiv/raw/titleabs.tsv",
    # Pickled tokenized splits.
    "train": "dataset/ogbn_arxiv/bert/train.pkl",
    "valid": "dataset/ogbn_arxiv/bert/valid.pkl",
    "test": "dataset/ogbn_arxiv/bert/test.pkl",
    # Templates whose `{}` placeholder is filled with the seed via str.format.
    "bert_models": "models/fine-tuned_bert_{}.pkl",
    "node_features": "node_feat/bert_feat_{}.pkl"
}

In [None]:
from ogb.nodeproppred.dataset_pyg import PygNodePropPredDataset
import torch_geometric.transforms as T
# Instantiate the ogbn-arxiv node-property dataset with the ToSparseTensor transform applied.
dataset = PygNodePropPredDataset(name='ogbn-arxiv', transform=T.ToSparseTensor())

In [None]:
# File locations read by the next cell: split index files, node labels,
# the node-index/paper-id mapping and the raw title/abstract table.
TRAIN_ID_PATH = 'dataset/ogbn_arxiv/split/time/train.csv.gz'
VALID_ID_PATH = 'dataset/ogbn_arxiv/split/time/valid.csv.gz'
TEST_ID_PATH = 'dataset/ogbn_arxiv/split/time/test.csv.gz'
LABEL_PATH = 'dataset/ogbn_arxiv/raw/node-label.csv.gz'
NODE2PAPER_PATH = 'dataset/ogbn_arxiv/mapping/nodeidx2paperid.csv.gz'
RAW_DATA_PATH = 'dataset/ogbn_arxiv/raw/titleabs.tsv'

In [None]:
# Raw title/abstract table, read without a header row.
raw_data = pd.read_csv(RAW_DATA_PATH, sep='\t', header=None)
raw_data.columns = ['Id', 'Title', 'Abstract']
# Patch the first id and drop the last row
# (presumably both are malformed in the raw file -- TODO confirm).
raw_data.iloc[0, 0] = 200971
raw_data = raw_data.drop(len(raw_data)-1)

node2paper = pd.read_csv(NODE2PAPER_PATH)
train_idx = pd.read_csv(TRAIN_ID_PATH, header=None)
val_idx = pd.read_csv(VALID_ID_PATH, header=None)
test_idx = pd.read_csv(TEST_ID_PATH, header=None)
label = pd.read_csv(LABEL_PATH, header=None)

# Only the first column of each split/label file is used.
train_idx = train_idx.iloc[:, 0].tolist()
val_idx = val_idx.iloc[:, 0].tolist()
test_idx = test_idx.iloc[:, 0].tolist()
label = label.iloc[:, 0].tolist()

# Bidirectional lookup between node index (column 0) and paper id (column 1).
paper2node_dict = {}
node2paper_dict = {}

# Columns are selected by position with .iloc: integer keys on a label-indexed
# row Series (row[0], row[1]) are deprecated in pandas, and iterating the two
# columns directly avoids building one Series per row.
for node_id, paper_id in tqdm(zip(node2paper.iloc[:, 0], node2paper.iloc[:, 1]),
                              total=len(node2paper)):
    paper2node_dict[int(paper_id)] = int(node_id)
    node2paper_dict[int(node_id)] = int(paper_id)
     

In [None]:
from transformers import BertTokenizer

train = []
val = []
test = []

# Membership is tested once per paper; sets make each test O(1) instead of
# scanning the full split lists every time.
train_ids, val_ids, test_ids = set(train_idx), set(val_idx), set(test_idx)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
for i, row in tqdm(raw_data.iterrows()):
    paper_id = int(row['Id'])
    # Skip texts that have no node in the graph.
    if paper_id not in paper2node_dict:
        continue
    node_id = paper2node_dict[paper_id]

    # Title and abstract are concatenated as-is, then mapped to vocabulary ids.
    tokens = tokenizer.tokenize(text=row['Title']+row['Abstract'])
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    processed = {
        'context': token_ids,
        'length': len(token_ids),
        'id': node_id,
        'label': label[int(node_id)],
    }

    if node_id in train_ids:
        train.append(processed)
    elif node_id in val_ids:
        val.append(processed)
    elif node_id in test_ids:
        test.append(processed)
    else:
        # A node outside every split is unexpected: report it and stop.
        print("NOT MATCH!!!!!")
        break

In [None]:
class Args:
    """Plain container for the BERT fine-tuning hyperparameters."""

    def __init__(self):
        # Dataset settings.
        self.num_classes, self.max_seq_length = 40, 500
        # Optimisation settings.
        self.train_epochs, self.batch_size = 2, 8
        self.learning_rate, self.dropout_rate = 2e-5, 0.5
        # Number of training iterations between validation passes.
        self.eval_steps = 4000


args = Args()
     

In [None]:
from torch.utils.data import Dataset, DataLoader

def pad_to_len(arr, padded_len, padding=0):
    """Return a copy of `arr` padded or truncated to exactly `padded_len` items.

    Args:
        arr: sequence of items (e.g. token ids). It is never modified.
        padded_len: target length; any value accepted by ``int()``.
        padding: item appended when `arr` is too short.

    Returns:
        A new list. Short inputs are right-padded with `padding`. Long inputs
        keep their first ``padded_len - 1`` items plus their final item, so a
        trailing end-of-sequence token survives truncation. A non-positive
        `padded_len` gives an empty list.
    """
    padded_len = int(padded_len)
    if padded_len <= 0:
        return []
    # Work on a copy so the caller's list (often the stored dataset sample)
    # is not silently padded or truncated in place.
    items = list(arr)
    shortfall = padded_len - len(items)
    if shortfall >= 0:
        return items + [padding] * shortfall
    return items[:padded_len - 1] + items[-1:]

class CitationDataset(Dataset):
    """Dataset over tokenized records (dicts with 'context', 'length', 'label')."""

    def __init__(self, data, max_seq_len, padding=0):
        self.data, self.max_seq_len, self.padding = data, max_seq_len, padding

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Shallow copy: only the 'context' entry is replaced when it is too long.
        sample = dict(self.data[index])
        tokens = sample['context']
        if len(tokens) > self.max_seq_len:
            sample['context'] = tokens[:self.max_seq_len]
        return sample

    def collate_fn(self, datas):
        """Collate samples into a dict of 'length', 'context' and 'label' tensors."""
        lengths = torch.LongTensor([sample['length'] for sample in datas])
        # Pad to the longest sample in the batch, capped at max_seq_len.
        padded_len = min(self.max_seq_len, max(lengths))
        contexts = []
        for sample in datas:
            contexts.append(pad_to_len(sample['context'], padded_len, self.padding))
        batch = {'length': lengths, 'context': torch.tensor(contexts)}
        batch['label'] = torch.LongTensor([sample['label'] for sample in datas])
        return batch

# Training split: shuffled batches, collated by the dataset's own collate_fn.
train_dataset = CitationDataset(train, max_seq_len=args.max_seq_length)
train_loader = DataLoader(
    train_dataset,
    batch_size=args.batch_size,
    shuffle=True,
    collate_fn=train_dataset.collate_fn,
)

# Validation split: same batching, original order.
valid_dataset = CitationDataset(val, max_seq_len=args.max_seq_length)
valid_loader = DataLoader(
    valid_dataset,
    batch_size=args.batch_size,
    shuffle=False,
    collate_fn=valid_dataset.collate_fn,
)

In [None]:
from sklearn.metrics import accuracy_score
class Metrics:
    """Minimal interface for streaming metrics: reset, update, get_score."""

    def __init__(self):
        self.name = 'Metric Name'

    def reset(self):
        """Clear any accumulated state."""
        pass

    def update(self, predicts, batch):
        """Fold one batch of predictions into the running state."""
        pass

    def get_score(self):
        """Return the current metric value."""
        pass


class Accuracy(Metrics):
    """Running top-1 accuracy.

    Attributes:
        n: number of samples seen since the last reset.
        match: number of those samples whose arg-max prediction equals the label.
    """

    def __init__(self):
        super().__init__()
        self.name = 'Accuracy'
        self.n = 0
        self.match = 0

    def reset(self):
        """Forget all samples seen so far."""
        self.n = 0
        self.match = 0

    def update(self, predicts, label):
        """Accumulate one batch.

        Args:
            predicts (Tensor): per-class scores; the arg-max is taken over dim 1.
            label (Tensor): class index for each row of `predicts`.
        """
        # Detach so no autograd graph is kept alive by the metric.
        predicts, label = predicts.detach().cpu(), label.detach().cpu()
        _, y_pred = torch.max(predicts, dim=1)
        # Counted directly in torch; no conversion to sklearn/numpy is needed.
        self.match += int((y_pred == label.reshape(-1)).sum().item())
        self.n += predicts.size(0)

    def get_score(self):
        """Return the accuracy as a float (0.0 when nothing has been seen)."""
        return self.match / self.n if self.n else 0.0

    def print_score(self):
        """Return the accuracy formatted with four decimals."""
        return '{:.4f}'.format(self.get_score())
     

In [None]:
def run_iter(batch, model, device, training):
    """Run `model` on one collated batch and return its first output.

    Args:
        batch (dict): needs 'context' (2-D tensor of token ids) and 'length'
            (1-D tensor with one sequence length per row).
        model: callable taking ``(context, attention_mask=...)`` and returning
            an indexable result; element 0 is returned.
        device: device the inputs are moved to.
        training (bool): when false, the forward pass runs under no_grad.

    Returns:
        ``model(...)[0]`` for the batch.
    """
    context = batch['context'].to(device)
    context_lens = batch['length'].to(device)

    # Mask is 1.0 for positions below each row's length and 0.0 for padding.
    # Lengths beyond the padded width give an all-ones row. One broadcast
    # comparison replaces the per-sample Python loop.
    positions = torch.arange(context.size(1), device=context.device)
    padding_mask = (positions.unsqueeze(0) < context_lens.unsqueeze(1)).float()

    if training:
        return model(context, attention_mask=padding_mask)[0]
    with torch.no_grad():
        return model(context, attention_mask=padding_mask)[0]

In [None]:
def training(train_loader, valid_loader, model, optimizer, epochs, eval_steps, device,
             save_path='best_val.pkl'):
    """Fine-tune `model`, validating every `eval_steps` iterations.

    Args:
        train_loader: iterable of training batches (dicts with a 'label' entry).
        valid_loader: loader passed to `testing` for validation.
        model: model passed to `run_iter`.
        optimizer: optimizer stepping the model parameters.
        epochs (int): number of passes over `train_loader`.
        eval_steps (int): validate after every this-many training iterations.
        device: device on which batches are placed.
        save_path (str): where the model with the best validation accuracy is
            saved with ``torch.save``; defaults to the former hard-coded path.

    Returns:
        The best validation accuracy observed (0 if validation never ran).
    """
    train_metrics = Accuracy()
    best_valid_acc = 0
    total_iter = 0
    criterion = torch.nn.CrossEntropyLoss()
    for epoch in range(epochs):
        train_trange = tqdm(enumerate(train_loader), total=len(train_loader), desc='training')
        train_loss = 0
        train_metrics.reset()
        for i, batch in train_trange:
            # Re-enter train mode on every step: validation below switches to eval.
            model.train()
            prob = run_iter(batch, model, device, training=True)
            answer = batch['label'].to(device)
            loss = criterion(prob, answer)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_iter += 1
            train_loss += loss.item()
            # Detach so the running metric does not hold on to the graph.
            train_metrics.update(prob.detach(), answer)
            train_trange.set_postfix(loss= train_loss/(i+1),
                                     **{train_metrics.name: train_metrics.print_score()})

            if total_iter % eval_steps == 0:
                valid_acc = testing(valid_loader, model, device, valid=True)
                if valid_acc > best_valid_acc:
                    best_valid_acc = valid_acc
                    torch.save(model, save_path)
    return best_valid_acc

In [None]:
def testing(dataloader, model, device, valid):
    """Evaluate `model` on `dataloader` and return its accuracy.

    `valid` only selects the progress-bar label ('validation' or 'testing').
    """
    tracker = Accuracy()
    loss_fn = torch.nn.CrossEntropyLoss()
    phase = 'validation' if valid else 'testing'
    progress = tqdm(enumerate(dataloader), total=len(dataloader), desc=phase)
    model.eval()
    running_loss = 0
    tracker.reset()
    for step, batch in progress:
        model.eval()
        logits = run_iter(batch, model, device, training=False)
        target = batch['label'].to(device)
        batch_loss = loss_fn(logits, batch['label'].to(device))
        running_loss += batch_loss.item()
        tracker.update(logits, target)
        # The bar shows the mean loss so far and the running accuracy.
        progress.set_postfix(loss= running_loss/(step+1),
                             **{tracker.name: tracker.print_score()})
    return tracker.match / tracker.n

In [None]:
from transformers import BertForSequenceClassification
from torch.optim import Adam
# Use the current CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:{}'.format(torch.cuda.current_device()))
else:
    device = torch.device('cpu')

# Pretrained BERT with a classification head sized to the number of classes.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=args.num_classes,
).to(device)
optimizer = Adam(model.parameters(), lr=args.learning_rate)

training(train_loader, valid_loader, model, optimizer,
         args.train_epochs, args.eval_steps, device)

In [None]:
# Release cached, unused GPU memory held by the caching allocator.
# Gradient tracking is irrelevant here, so the no_grad wrapper is dropped.
torch.cuda.empty_cache()

In [None]:
# pip install ogb
# pip install transformers
#Utils


def set_seed(SEED=0):
    """Seed Python, NumPy and PyTorch RNGs and make cuDNN deterministic."""
    # Python and NumPy generators.
    random.seed(SEED)
    np.random.seed(SEED)
    # PyTorch CPU and CUDA generators.
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    # cuDNN: deterministic kernels, no auto-tuning.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

def load_data(config_path):
    """Load the tokenized train/valid/test splits named in a JSON config.

    The config maps 'train', 'valid' and 'test' to pickle file paths.
    Returns the three unpickled objects as ``(train, valid, test)``.
    """
    with open(config_path) as config_file:
        paths = json.load(config_file)

    splits = []
    for key in ('train', 'valid', 'test'):
        with open(paths[key], 'rb') as handle:
            splits.append(pickle.load(handle))

    train, valid, test = splits
    return train, valid, test

def pad_to_len(arr, padded_len, padding=0):
    """Return a copy of `arr` padded or truncated to exactly `padded_len` items.

    Args:
        arr: sequence of items (e.g. token ids). It is never modified.
        padded_len: target length; any value accepted by ``int()``.
        padding: item appended when `arr` is too short.

    Returns:
        A new list. Short inputs are right-padded with `padding`. Long inputs
        keep their first ``padded_len - 1`` items plus their final item. A
        non-positive `padded_len` gives an empty list.
    """
    padded_len = int(padded_len)
    if padded_len <= 0:
        return []
    # Copy first so the caller's list is left untouched.
    items = list(arr)
    shortfall = padded_len - len(items)
    if shortfall >= 0:
        return items + [padding] * shortfall
    # The result must be returned: callers build tensors from it.
    return items[:padded_len - 1] + items[-1:]

In [None]:
## Dataset

import torch
from torch.utils.data import Dataset

class CitationDataset(Dataset):
    """Tokenized-paper dataset whose collate_fn pads each batch to a common length."""

    def __init__(self, data, max_length, padding=0):
        self.data = data
        self.padding = padding
        self.max_seq_len = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        record = dict(self.data[index])
        # Replace over-long contexts with a truncated copy; others are left as-is.
        if len(record['context']) > self.max_seq_len:
            record['context'] = record['context'][:self.max_seq_len]
        return record

    def collate_fn(self, datas):
        """Build the 'length', 'context' and 'label' tensors for one batch."""
        batch = {'length': torch.LongTensor([record['length'] for record in datas])}
        # Target width: the longest sample of the batch, capped at max_seq_len.
        width = min(self.max_seq_len, max(batch['length']))
        padded = [pad_to_len(record['context'], width, self.padding) for record in datas]
        batch['context'] = torch.tensor(padded)
        batch['label'] = torch.LongTensor([record['label'] for record in datas])
        return batch

In [None]:
## Make_bert_dataset
import torch
import pandas as pd
import pickle
import os
import json
from tqdm import tqdm
from transformers import BertTokenizer

def tokenize(data, paper2node, idx, label):
    """Tokenize each paper and sort it into the train/valid/test split.

    Args:
        data: DataFrame with 'Id', 'Title' and 'Abstract' columns.
        paper2node (dict): paper id -> node index; papers not in it are skipped.
        idx (dict): 'train'/'valid'/'test' -> collection of node indices.
        label: sequence indexed by node index giving the class label.

    Returns:
        ``(train, valid, test)`` lists of dicts with keys 'context' (token
        ids), 'length', 'id' (node index) and 'label'. Processing stops early
        at the first node that belongs to no split.
    """
    train, valid, test = [], [], []
    # Sets give O(1) membership; the split lists would be scanned per paper.
    train_ids, valid_ids, test_ids = set(idx['train']), set(idx['valid']), set(idx['test'])
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    for i, row in tqdm(data.iterrows()):
        paper_id = int(row['Id'])
        if paper_id not in paper2node:
            continue
        node_id = paper2node[paper_id]

        # The special tokens are written into the text before tokenizing.
        tokens = tokenizer.tokenize(text="[CLS] " + row['Title']+row['Abstract'] + " [SEP]")
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        processed = {
            'context': token_ids,
            'length': len(token_ids),
            'id': node_id,
            'label': label[int(node_id)],
        }

        if node_id in train_ids:
            train.append(processed)
        elif node_id in valid_ids:
            valid.append(processed)
        elif node_id in test_ids:
            test.append(processed)
        else:
            print("NOT MATCH!!!!!")
            break

    return train, valid, test

def main():
    """Build and pickle the tokenized splits and the paper/node id mappings.

    NOTE: reads the module-level `config` dict; nothing is loaded from
    config.json here.
    """
    encoder_dir = os.path.join('dataset/ogbn_arxiv', config['encoder'])
    if not os.path.isdir(encoder_dir):
        os.makedirs(encoder_dir)
        print('Create folder: dataset/ogbn_arxiv/{}'.format(config['encoder']))
    else:
        print('dataset/ogbn_arxiv/{} exists!'.format(config['encoder']))

    # Load raw ogbn-arxiv data.
    # header=None matches the earlier loading cell: without it the first
    # record is consumed as the header and the id patch below lands on the
    # wrong row.
    raw_data = pd.read_csv(config['raw_text_path'], sep='\t', header=None)
    node2paper = pd.read_csv('dataset/ogbn_arxiv/mapping/nodeidx2paperid.csv.gz')
    train_idx = pd.read_csv('dataset/ogbn_arxiv/split/time/train.csv.gz', header=None)
    valid_idx = pd.read_csv('dataset/ogbn_arxiv/split/time/valid.csv.gz', header=None)
    test_idx = pd.read_csv('dataset/ogbn_arxiv/split/time/test.csv.gz', header=None)
    label = pd.read_csv('dataset/ogbn_arxiv/raw/node-label.csv.gz', header=None)

    # Preprocess & modify csv error: patch the first id, drop the last row.
    raw_data.columns = ['Id', 'Title', 'Abstract']
    raw_data.iloc[0, 0] = 200971
    raw_data = raw_data.drop(len(raw_data)-1)

    train_idx = train_idx.iloc[:, 0].tolist()
    valid_idx = valid_idx.iloc[:, 0].tolist()
    test_idx = test_idx.iloc[:, 0].tolist()
    idx = {'train': train_idx, 'valid': valid_idx, 'test': test_idx}
    label = label.iloc[:, 0].tolist()

    # Create node_id->paper_id dict, paper_id->node_id dict.
    # Columns are taken by position (.iloc); integer keys on a label-indexed
    # row Series are deprecated in pandas.
    paper2node_dict = {}
    node2paper_dict = {}
    for node_id, paper_id in tqdm(zip(node2paper.iloc[:, 0], node2paper.iloc[:, 1]),
                                  total=len(node2paper)):
        paper2node_dict[int(paper_id)] = int(node_id)
        node2paper_dict[int(node_id)] = int(paper_id)

    train, valid, test = tokenize(raw_data, paper2node_dict, idx, label)

    # Persist every artifact at the path given in the config.
    outputs = (
        ('train', train),
        ('valid', valid),
        ('test', test),
        ('paper2node', paper2node_dict),
        ('node2paper', node2paper_dict),
    )
    for key, payload in outputs:
        with open(config[key], 'wb') as f:
            pickle.dump(payload, f)

if __name__ == "__main__":
    main()

In [None]:
## Encode Features
from transformers import BertTokenizer, BertModel
import pickle
import argparse
import json
import pandas as pd
import torch
import os
from tqdm import tqdm

def freeze_bert_layers(model):
    """Disable gradients and dropout for the BERT embeddings, pooler and first 12 encoder layers.

    The model is modified in place and also returned.
    """
    bert = model.bert
    num_frozen = 12

    def _no_grad(module):
        for param in module.parameters():
            param.requires_grad = False

    _no_grad(bert.embeddings)
    bert.embeddings.dropout.p = 0.0
    _no_grad(bert.pooler)

    for layer_index in range(num_frozen):
        block = bert.encoder.layer[layer_index]
        _no_grad(block)
        # Zero every dropout probability inside the block.
        for dropout in (block.attention.self.dropout,
                        block.attention.output.dropout,
                        block.output.dropout):
            dropout.p = 0.0
    return model

def encode_features(data, data_len, paper2node_dict, model):
    """Encode every paper into a 768-d feature row using `model.bert`.

    Args:
        data: DataFrame with 'Id', 'Title' and 'Abstract' columns.
        data_len (int): number of rows of the returned feature matrix.
        paper2node_dict (dict): paper id -> row index; other papers are skipped.
        model: module exposing a `.bert` encoder; it is frozen via
            `freeze_bert_layers` before encoding.

    Returns:
        Tensor of shape ``(data_len, 768)`` on the model's device. Rows with no
        matching paper stay zero.
    """
    # Follow the model's device instead of assuming CUDA, so the CPU fallback
    # chosen by the caller works.
    device = next(model.parameters()).device
    node_feats = torch.zeros((data_len, 768), device=device)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = freeze_bert_layers(model)

    # Inference only: do not record an autograd graph.
    with torch.no_grad():
        for i, row in tqdm(data.iterrows()):
            # Cast like tokenize() does, so string-typed ids still match the
            # integer dictionary keys.
            paper_id = int(row['Id'])
            if paper_id not in paper2node_dict:
                continue
            context = "[CLS] " + row['Title'] + row['Abstract'] + " [SEP]"
            tokenize_context = tokenizer.tokenize(context)

            # Hard cap of 512 tokens per input.
            if len(tokenize_context) > 512:
                tokenize_context = tokenize_context[:512]

            context_id = tokenizer.convert_tokens_to_ids(tokenize_context)
            context_id = torch.LongTensor(context_id).unsqueeze(0).to(device)
            # Keep the hidden state of the first token.
            feat = model.bert(context_id)[0].squeeze(0)[0]
            node_feats[paper2node_dict[paper_id], :] = feat

    # Release cached blocks once, rather than after every row.
    if device.type == 'cuda':
        torch.cuda.empty_cache()

    return node_feats
    

def main():
    """Encode all papers with the fine-tuned model and save the feature matrix.

    Paths come from config.json. The seed selects which fine-tuned model is
    loaded and names the output file.
    """
    parser = argparse.ArgumentParser(description='Encode Node Features')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--node_feat_dir', type=str, default='node_feat',
                        help='Directory to the fine-tuned node features.')
    # An empty argv is parsed on purpose, so the defaults above are always used.
    args = parser.parse_args([])

    device = torch.device('cuda:{}'.format(args.device) if torch.cuda.is_available()
                          else 'cpu')

    with open('config.json') as f:
        config = json.load(f)

    # Load data & model.
    MODEL_PATH = config['bert_models'].format(args.seed)
    # header=None matches the earlier loading cell: without it the first
    # record is consumed as the header and the id patch below lands on the
    # wrong row.
    data = pd.read_csv(config['raw_text_path'], sep='\t', header=None)
    with open(config['node2paper'], 'rb') as f:
        node2paper_dict = pickle.load(f)
    with open(config['paper2node'], 'rb') as f:
        paper2node_dict = pickle.load(f)

    data.columns = ['Id', 'Title', 'Abstract']
    data.iloc[0, 0] = 200971
    data = data.drop(len(data)-1)
    # NOTE(review): torch.load unpickles a whole model object; only load
    # checkpoints from a trusted source.
    model = torch.load(MODEL_PATH).to(device)

    # Create or check directory.
    if not os.path.isdir(args.node_feat_dir):
        os.makedirs(args.node_feat_dir)
        print('Create folder: {}'.format(args.node_feat_dir))
    else:
        print('{} exists!'.format(args.node_feat_dir))

    node_feats = encode_features(data, len(node2paper_dict),
                                 paper2node_dict, model)

    torch.save(node_feats, config['node_features'].format(args.seed))

if __name__ == '__main__':
    main()

## 2. https://github.com/mpcrlab/MolecularTransformerEmbeddings

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.autograd import Variable
import math
import copy
import numpy as np
import random
import sys
import time
from math import ceil

import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import Sampler

import argparse
import time


In [8]:
# Number of printable ASCII characters covered by the encoding.
PRINTABLE_ASCII_CHARS = 95

# Special symbols get the code points immediately after the printable range.
_extra_chars = ["seq_start", "seq_end", "pad"]
EXTRA_CHARS = dict(
    zip(
        _extra_chars,
        map(chr, range(PRINTABLE_ASCII_CHARS, PRINTABLE_ASCII_CHARS + len(_extra_chars))),
    )
)
ALPHABET_SIZE = len(EXTRA_CHARS) + PRINTABLE_ASCII_CHARS

def encode_char(c):
    """Return the code point of character `c` shifted down by 32."""
    code_point = ord(c)
    return code_point - 32

def decode_char(n):
    """Return the character whose code point is `n` shifted up by 32."""
    code_point = n + 32
    return chr(code_point)

def smiles_iupac_batch(instances):
    """Collate (smiles, iupac) tensor pairs into padded batch tensors.

    Returns ``(batch_smiles, batch_iupac_in, batch_iupac_out, smiles_lens,
    iupac_lens)``. Each length is the sequence length plus one, for the start
    symbol (inputs) or the end symbol (output).
    """
    pad_id = ord(EXTRA_CHARS['pad'])
    start_id = ord(EXTRA_CHARS['seq_start'])
    end_id = ord(EXTRA_CHARS['seq_end'])
    n_items = len(instances)

    smiles_lens = torch.tensor([pair[0].shape[0] + 1 for pair in instances], dtype=torch.long)
    iupac_lens = torch.tensor([pair[1].shape[0] + 1 for pair in instances], dtype=torch.long)
    smiles_width = smiles_lens.max().item()
    iupac_width = iupac_lens.max().item()

    # Start from all-padding tensors and overwrite the used positions.
    batch_smiles = torch.full((n_items, smiles_width), pad_id, dtype=torch.long)
    batch_iupac_in = torch.full((n_items, iupac_width), pad_id, dtype=torch.long)
    batch_iupac_out = torch.full((n_items, iupac_width), pad_id, dtype=torch.long)

    for row, pair in enumerate(instances):
        smiles_end = smiles_lens[row]
        iupac_end = iupac_lens[row]

        # Encoder input: start symbol followed by the SMILES codes.
        batch_smiles[row, 0] = start_id
        batch_smiles[row, 1:smiles_end] = pair[0]

        # Decoder input: start symbol followed by the IUPAC codes.
        batch_iupac_in[row, 0] = start_id
        batch_iupac_in[row, 1:iupac_end] = pair[1]

        # Decoder target: the IUPAC codes followed by the end symbol.
        batch_iupac_out[row, 0:iupac_end-1] = pair[1]
        batch_iupac_out[row, iupac_end-1] = end_id

    return batch_smiles, batch_iupac_in, batch_iupac_out, smiles_lens, iupac_lens

class SmilesIupacDataset(Dataset):
    """Dataset of (SMILES, IUPAC) string pairs, one tab-separated pair per line.

    Args:
        data_path: text file with two tab-separated fields per line.
        max_len: optional cap on the encoded length; one slot is reserved, so
            strings are cut to ``max_len - 1`` codes. Falsy means no cap.
    """

    def __init__(self, data_path, max_len=None):
        # Context manager so the file handle is closed once the pairs are read.
        with open(data_path, "r") as data_file:
            self.pairs = [line.strip("\n").split("\t") for line in data_file]
        self.max_len = max_len - 1 if max_len else 0

    def __len__(self):
        return len(self.pairs)

    def string_to_tensor(self, string):
        """Encode `string` as a uint8 tensor of character codes, truncated to `max_len`."""
        tensor = torch.tensor(list(map(encode_char, string)), dtype=torch.uint8)

        if self.max_len > 0:
            tensor = tensor[:self.max_len]

        return tensor

    def __getitem__(self, index):
        smiles, iupac = self.pairs[index]
        return self.string_to_tensor(smiles), self.string_to_tensor(iupac)

def get_dataloader(batch_size, data_path, max_len=256):
    """Return ``(loader, dataset)`` for the SMILES/IUPAC pair file at `data_path`."""
    pair_dataset = SmilesIupacDataset(data_path, max_len=max_len)
    loader = DataLoader(
        pair_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=4,
        collate_fn=smiles_iupac_batch,
    )
    return loader, pair_dataset


In [9]:
# Transformer
class OneHotEmbedding(nn.Module):
    """Embedding that maps each symbol index to its fixed one-hot vector."""

    def __init__(self, alphabet_size):
        super().__init__()
        self.alphabet_size = alphabet_size
        # Rows of the identity matrix are the one-hot vectors.
        self.embedding = nn.Embedding.from_pretrained(torch.eye(alphabet_size))

    def forward(self, x):
        # Use the attribute defined in __init__; `self.embed` does not exist on
        # this class and raised AttributeError.
        return self.embedding(x)
    
class Embedding(nn.Module):
    """Learned lookup table from symbol indices to `d_model`-dimensional vectors."""

    def __init__(self, alphabet_size, d_model):
        super().__init__()
        self.alphabet_size, self.d_model = alphabet_size, d_model
        self.embed = nn.Embedding(alphabet_size, d_model)

    def forward(self, x):
        vectors = self.embed(x)
        return vectors

class PositionalEncoder(nn.Module):
    """Scale embeddings by sqrt(d_model), add a fixed sinusoidal table, apply dropout.

    Args:
        d_model: embedding width.
        max_seq_len: number of positions in the precomputed table.
        dropout: dropout probability applied to the sum.

    The table is the buffer ``pe`` of shape ``(1, max_seq_len, d_model)``. For
    every even column ``i``:
        pe[pos, i]     = sin(pos / 10000 ** (2 * i / d_model))
        pe[pos, i + 1] = cos(pos / 10000 ** (2 * (i + 1) / d_model))
    The exponents match the original loop exactly, so checkpoints saved with
    this buffer remain compatible.
    """

    def __init__(self, d_model, max_seq_len = 6000, dropout = 0.1):
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(p=dropout)

        # Vectorized construction: the nested Python loop made millions of
        # math.sin/cos calls. Computed in float64, stored as float32.
        position = torch.arange(max_seq_len, dtype=torch.float64).unsqueeze(1)
        even_dims = torch.arange(0, d_model, 2, dtype=torch.float64)
        # For odd d_model the last even column has no cosine partner.
        odd_dims = even_dims[:d_model // 2] + 1

        pe = torch.zeros(max_seq_len, d_model, dtype=torch.float64)
        pe[:, 0::2] = torch.sin(position / torch.pow(10000.0, 2 * even_dims / d_model))
        pe[:, 1::2] = torch.cos(position / torch.pow(10000.0, 2 * odd_dims / d_model))
        self.register_buffer('pe', pe.float().unsqueeze(0))

    def forward(self, x):
        # Make the embeddings relatively larger than the positional signal.
        x = x * math.sqrt(self.d_model)
        # The buffer moves with the module, so no explicit device transfer is
        # needed; it is sliced to the sequence length found in dim 1 of x.
        seq_len = x.size(1)
        x = x + self.pe[:, :seq_len]
        return self.dropout(x)

    
class Norm(nn.Module):
    """Normalise the last dimension, then apply a learnable gain (`alpha`) and `bias`."""

    def __init__(self, d_model, eps = 1e-6):
        super().__init__()
        self.size = d_model
        self.eps = eps
        # Learnable calibration parameters, initialised to the identity transform.
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))

    def forward(self, x):
        centered = x - x.mean(dim=-1, keepdim=True)
        # eps is added to the standard deviation to avoid division by zero.
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.alpha * centered / spread + self.bias

def attention(q, k, v, d_k, mask=None, dropout=None):
    """Scaled dot-product attention.

    Scores are ``q @ k^T / sqrt(d_k)``. Where `mask` (given an extra dim at
    index 1) equals 0, scores are set to -1e9 before the softmax. `dropout`,
    if given, is applied to the attention weights. Returns ``weights @ v``.
    """
    weights = torch.matmul(q, k.transpose(-2, -1)) /  math.sqrt(d_k)

    if mask is not None:
        blocked = mask.unsqueeze(1) == 0
        weights = weights.masked_fill(blocked, -1e9)

    weights = F.softmax(weights, dim=-1)
    if dropout is not None:
        weights = dropout(weights)

    return torch.matmul(weights, v)

    
class MultiHeadAttention(nn.Module):
    """Multi-head attention with `heads` heads of width ``d_model // heads``."""

    def __init__(self, heads, d_model, dropout = 0.1):
        super().__init__()

        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads

        # Input projections for queries, values and keys.
        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)
        # Output projection applied to the concatenated heads.
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        batch = q.size(0)

        def split_heads(projected):
            # (batch, seq, d_model) -> (batch, heads, seq, d_k)
            return projected.view(batch, -1, self.h, self.d_k).transpose(1, 2)

        k = split_heads(self.k_linear(k))
        q = split_heads(self.q_linear(q))
        v = split_heads(self.v_linear(v))

        attended = attention(q, k, v, self.d_k, mask, self.dropout)

        # Merge the heads back into one d_model-wide vector per position.
        merged = attended.transpose(1, 2).contiguous().view(batch, -1, self.d_model)
        return self.out(merged)

class FeedForward(nn.Module):
    """Position-wise MLP: Linear(d_model, d_ff) -> ReLU -> dropout -> Linear(d_ff, d_model)."""

    def __init__(self, d_model, d_ff=2048, dropout = 0.1):
        super().__init__()
        # d_ff defaults to 2048.
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        hidden = F.relu(self.linear_1(x))
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)
    
class EncoderLayer(nn.Module):
    """Pre-norm encoder block: self-attention and feed-forward, each with a residual connection."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        # Self-attention sub-layer on the normalised input.
        normed = self.norm_1(x)
        attended = self.attn(normed, normed, normed, mask)
        x = x + self.dropout_1(attended)

        # Feed-forward sub-layer on the normalised result.
        normed = self.norm_2(x)
        x = x + self.dropout_2(self.ff(normed))
        return x
    
# build a decoder layer with two multi-head attention layers and
# one feed-forward layer
class DecoderLayer(nn.Module):
    """Pre-norm decoder block: masked self-attention, encoder attention, feed-forward."""

    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, dropout=dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        # 1) Self-attention over the target, restricted by trg_mask.
        normed = self.norm_1(x)
        self_attended = self.attn_1(normed, normed, normed, trg_mask)
        x = x + self.dropout_1(self_attended)

        # 2) Attention over the encoder outputs, restricted by src_mask.
        normed = self.norm_2(x)
        cross_attended = self.attn_2(normed, e_outputs, e_outputs, src_mask)
        x = x + self.dropout_2(cross_attended)

        # 3) Position-wise feed-forward.
        normed = self.norm_3(x)
        x = x + self.dropout_3(self.ff(normed))
        return x
    
def get_clones(module, N):
    """Return an nn.ModuleList holding `N` independent deep copies of `module`."""
    clones = nn.ModuleList()
    for _ in range(N):
        clones.append(copy.deepcopy(module))
    return clones

class Encoder(nn.Module):
    """Embedding, positional encoding and `N` encoder layers, followed by a final norm."""

    def __init__(self, alphabet_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedding(alphabet_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, src, mask):
        hidden = self.pe(self.embed(src))
        for layer_index in range(self.N):
            layer = self.layers[layer_index]
            hidden = layer(hidden, mask)
        return self.norm(hidden)
    
class Decoder(nn.Module):
    """Embedding, positional encoding and `N` decoder layers, followed by a final norm."""

    def __init__(self, alphabet_size, d_model, N, heads, dropout):
        super().__init__()
        self.N = N
        self.embed = Embedding(alphabet_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        self.layers = get_clones(DecoderLayer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        hidden = self.pe(self.embed(trg))
        for layer_index in range(self.N):
            layer = self.layers[layer_index]
            hidden = layer(hidden, e_outputs, src_mask, trg_mask)
        return self.norm(hidden)

class Transformer(nn.Module):
    """Encoder-decoder model with a linear projection onto the alphabet."""

    def __init__(self, alphabet_size, d_model, N, heads=8, dropout=0.1):
        super().__init__()
        self.encoder = Encoder(alphabet_size, d_model, N, heads, dropout)
        self.decoder = Decoder(alphabet_size, d_model, N, heads, dropout)
        self.out = nn.Linear(d_model, alphabet_size)

    def forward(self, src, trg, src_mask, trg_mask):
        # Encode the source once; the decoder attends to it at every layer.
        memory = self.encoder(src, src_mask)
        decoded = self.decoder(trg, memory, src_mask, trg_mask)
        return self.out(decoded)

    
def nopeak_mask(size, device):
    """Return a ``(1, size, size)`` boolean mask that is True on and below the diagonal."""
    # Ones strictly above the diagonal mark the positions to hide.
    above_diagonal = torch.triu(torch.ones((size, size), dtype=torch.uint8), diagonal=1)
    visible = above_diagonal.unsqueeze(0) == 0
    return visible.to(device)

def create_masks(src, trg=None, pad_idx=ord(EXTRA_CHARS['pad']), device=None):
    """Build the padding mask for `src` and, optionally, the causal mask for `trg`.

    Args:
        src: source index tensor; positions equal to `pad_idx` are masked out.
        trg: optional target index tensor.
        pad_idx: index of the padding symbol.
        device: device for the causal mask; defaults to `trg`'s device.

    Returns:
        `src_mask` alone when `trg` is None, otherwise ``(src_mask, trg_mask)``.
        `trg_mask` combines the target padding mask with the no-peek mask.
    """
    src_mask = (src != pad_idx).unsqueeze(-2)
    if trg is None:
        return src_mask

    # Default to the target's own device so the `&` below never mixes devices.
    if device is None:
        device = trg.device
    trg_pad_mask = (trg != pad_idx).unsqueeze(-2)
    # nopeak_mask already returns the mask on `device`; no second transfer is needed.
    trg_mask = trg_pad_mask & nopeak_mask(trg.size(1), device)
    return src_mask, trg_mask

In [10]:
class CosineWithRestarts(torch.optim.lr_scheduler._LRScheduler):
    """
    Cosine annealing with restarts.
    Parameters
    ----------
    optimizer : torch.optim.Optimizer
    T_max : int
        The maximum number of iterations within the first cycle.
    eta_min : float, optional (default: 0)
        The minimum learning rate.
    last_epoch : int, optional (default: -1)
        The index of the last epoch.
    factor : float, optional (default: 1)
        Multiplier applied to the cycle length at every restart.
    """

    def __init__(self,
                 optimizer,
                 T_max,
                 eta_min = 0.,
                 last_epoch = -1,
                 factor = 1.):
        # pylint: disable=invalid-name
        self.T_max = T_max
        self.eta_min = eta_min
        self.factor = factor
        self._last_restart = 0
        self._cycle_counter = 0
        self._cycle_factor = 1.
        self._updated_cycle_len = T_max
        self._initialized = False
        super(CosineWithRestarts, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return the list of learning rates, one per base learning rate."""
        # HACK: We need to check if this is the first time get_lr() was called, since
        # we want to start with step = 0, but _LRScheduler calls get_lr with
        # last_epoch + 1 when initialized.
        if not self._initialized:
            self._initialized = True
            return self.base_lrs

        step = self.last_epoch + 1
        self._cycle_counter = step - self._last_restart

        # Position inside the current cycle, as a fraction of its length.
        cycle_fraction = (self._cycle_counter % self._updated_cycle_len) / self._updated_cycle_len
        lrs = [
            self.eta_min + ((lr - self.eta_min) / 2) * (np.cos(np.pi * cycle_fraction) + 1)
            for lr in self.base_lrs
        ]

        if self._cycle_counter % self._updated_cycle_len == 0:
            # Adjust the cycle length.
            self._cycle_factor *= self.factor
            self._cycle_counter = 0
            self._updated_cycle_len = int(self._cycle_factor * self.T_max)
            self._last_restart = step

        # The computed rates must be returned; a bare `return` handed None to
        # the scheduler machinery.
        return lrs

In [11]:

#from transformer import Transformer, create_masks
#from load_data import ALPHABET_SIZE, EXTRA_CHARS


# Command-line options of the embedding script, one row per option:
# (flag, value type, default, help text).
_EMBED_OPTIONS = (
    ("--data_path", str, "data/amino_acids.txt",
     "Path to a text file with one SMILES string per line. These strings will be embedded."),
    ("--checkpoint_path", str, "checkpoints/pretrained.ckpt",
     "Path to a binary file containing pretrained model weights."),
    ("--max_length", int, 256,
     "Strings in the data longer than this length will be truncated."),
    ("--embedding_size", int, 512,
     "Embedding size used in the pretrained Transformer."),
    ("--num_layers", int, 6,
     "Number of layers used in the Encoder and Decoder of the pretrained Transformer."),
)

parser = argparse.ArgumentParser()
for _flag, _kind, _default, _help in _EMBED_OPTIONS:
    parser.add_argument(_flag, type=_kind, default=_default, help=_help)

# An empty argument list is parsed, so every option takes its default value.
args = parser.parse_args([])

print(args)

def encode_char(c):
    """Return the code point of character ``c`` shifted down by 32."""
    code_point = ord(c)
    return code_point - 32

def encode_smiles(string, start_char=EXTRA_CHARS['seq_start']):
    """Encode a SMILES string as a ``(1, length)`` tensor of ``torch.long`` ids.

    The sequence begins with the raw code point of ``start_char``, followed by
    ``encode_char`` applied to each character of ``string``. It is cut to at
    most ``args.max_length`` entries before the leading dimension is added.
    """
    token_ids = [ord(start_char)]
    token_ids.extend(encode_char(ch) for ch in string)
    encoded = torch.tensor(token_ids, dtype=torch.long)
    return encoded[:args.max_length].unsqueeze(0)


# Read one SMILES string per line. The context manager closes the file as
# soon as the list is built instead of leaving the handle open.
with open(args.data_path, "r") as smiles_file:
    smiles_strings = [line.strip("\n") for line in smiles_file]
print("Loaded {0} SMILES strings from {1}".format(len(smiles_strings), args.data_path))

# Build the Transformer in eval mode and wrap it in DataParallel before the
# checkpoint is loaded.
# NOTE(review): presumably the wrapper is there so the parameter names match a
# checkpoint saved from a DataParallel model — confirm against the checkpoint.
print("Initializing Transformer...")
model = Transformer(ALPHABET_SIZE, args.embedding_size, args.num_layers).eval()
model = torch.nn.DataParallel(model)
print("Transformer Initialized.")

# Load the checkpoint onto the CPU and restore the weights stored under its
# 'state_dict' key.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source.
print("Loading pretrained weights from", args.checkpoint_path)
checkpoint = torch.load(args.checkpoint_path, map_location=torch.device("cpu"))
model.load_state_dict(checkpoint['state_dict'])
print("Pretrained weights loaded")
# Unwrap the DataParallel container and keep only the encoder for embedding.
model = model.module.cpu()
encoder = model.encoder.cpu()

# Embed every SMILES string with the encoder; gradients are disabled because
# this is inference only.
embeddings = []
with torch.no_grad():
    for smiles in smiles_strings:
        encoded = encode_smiles(smiles)
        mask = create_masks(encoded)
        # [0] selects the first entry of the encoder output; presumably this
        # drops the leading dimension added by encode_smiles — TODO confirm.
        embedding = encoder(encoded, mask)[0].numpy()
        embeddings.append(embedding)
        print("embedded {0} into {1} matrix.".format(smiles, str(embedding.shape)))
        
print("All SMILES strings embedded. Saving...")
# The archive is named after the input file: <out_dir>/<input stem>.npz
filename = os.path.splitext(os.path.basename(args.data_path))[0]
out_dir = "embeddings/"
out_file = os.path.join(out_dir, filename + ".npz")

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(out_dir, exist_ok=True)

# One array per SMILES string, stored in the archive under the string itself.
out_dict = dict(zip(smiles_strings, embeddings))
np.savez(out_file, **out_dict)
print("Saved embeddings to", out_file)

Namespace(data_path='data/amino_acids.txt', checkpoint_path='checkpoints/pretrained.ckpt', max_length=256, embedding_size=512, num_layers=6)
Loaded 20 SMILES strings from data/amino_acids.txt
Initializing Transformer...
Transformer Initialized.
Loading pretrained weights from checkpoints/pretrained.ckpt
Pretrained weights loaded
embedded C(CC(C(=O)O)N)CN=C(N)N into (23, 512) matrix.
embedded C1=C(NC=N1)CC(C(=O)O)N into (23, 512) matrix.
embedded CCC(C)C(C(=O)O)N into (17, 512) matrix.
embedded CC(C)CC(C(=O)O)N into (17, 512) matrix.
embedded C(CCN)CC(C(=O)O)N into (18, 512) matrix.
embedded CSCCC(C(=O)O)N into (15, 512) matrix.
embedded C1=CC=C(C=C1)CC(C(=O)O)N into (25, 512) matrix.
embedded CC(C(C(=O)O)N)O into (16, 512) matrix.
embedded C1=CC=C2C(=C1)C(=CN2)CC(C(=O)O)N into (33, 512) matrix.
embedded CC(C)C(C(=O)O)N into (16, 512) matrix.
embedded CC(C(=O)O)N into (12, 512) matrix.
embedded C(C(C(=O)O)N)C(=O)N into (20, 512) matrix.
embedded C(C(C(=O)O)N)C(=O)O into (20, 512) matrix

In [None]:
## train

# Command-line options for the training script.
parser = argparse.ArgumentParser()

parser.add_argument("--data_path", type=str, default="data/smiles_iupac_train_15k.tsv", help="Path to a csv containing pairs of strings for training.")
parser.add_argument("--checkpoint_path", type=str, default=None, help="Path to a binary file containing pretrained model weights. If not supplied, a random initialization will be used.")
parser.add_argument("--batch_size", type=int, default=24, help="How many samples to average in each training step. If more than one GPU is available, samples will be split across devices.")
# The learning rate is fractional, so it must be parsed as a float; with
# type=int a value such as "0.001" is rejected by argparse.
parser.add_argument("--learning_rate", type=float, default=1e-4, help="Weight updates calculated during gradient descent will be multiplied by this factor before they are added to the weights.")
parser.add_argument("--max_length", type=int, default=256, help="Strings in the data longer than this length will be truncated.")
parser.add_argument("--embedding_size", type=int, default=512, help="Each SMILES string character will be embedded to a vector with this many elements.")
parser.add_argument("--num_layers", type=int, default=6, help="The Encoder and Decoder modules of the Transformer network will each have this many sequential layers.")
parser.add_argument("--num_epochs", type=int, default=10, help="In each epoch, every training sample will be used once.")
parser.add_argument("--cpu", action="store_true", help="Set this flag to run only on the CPU (no cuda needed).")

# An empty argument list is parsed, so every option takes its default value.
args = parser.parse_args([])

print(args)

# Train on CUDA only when it is available and --cpu was not requested.
DEVICE = torch.device(
    "cuda" if (torch.cuda.is_available() and not args.cpu) else "cpu"
)

print("{0} GPUs available. Training with {1}.".format(torch.cuda.device_count(), DEVICE))

def print_progress(time, epoch, iters, loss):
    """Print a one-line progress report.

    ``time`` is the elapsed-minutes value to display; inside this function it
    shadows the ``time`` module. ``epoch`` and ``iters`` are the epoch and
    batch numbers to show, and ``loss`` is the loss value to report.
    """
    print(f"{time!s} minutes : epoch {epoch!s} : batch {iters!s} : loss = {loss!s}")
    
def save(epoch, model, optimizer):
    """Write a training checkpoint to ``checkpoints/epoch_<epoch+1>.ckpt``.

    The checkpoint stores the zero-based ``epoch`` index, the model and
    optimizer state dicts, and the learning rate of the optimizer's first
    parameter group. The target directory is created if it does not exist, so
    a finished epoch is not lost to a missing folder.
    """
    checkpoint_name = "checkpoints/epoch_{0}.ckpt".format(epoch+1)
    os.makedirs(os.path.dirname(checkpoint_name), exist_ok=True)
    state = {
        'epoch': epoch,
        'state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'lr': optimizer.param_groups[0]['lr'],
    }
    torch.save(state, checkpoint_name)
    print("saved checkpoint at", checkpoint_name)
    
def train_epoch(epoch, model, dataloader, optimizer, sched=None):
    """Train ``model`` for one pass over ``dataloader``, then save a checkpoint.

    Each batch is moved to ``DEVICE``, masks are built with ``create_masks``,
    and the cross-entropy between the predictions and ``iupac_out`` is
    minimised, ignoring the padding index. The mean loss is reported roughly
    100 times per epoch (at least every batch for short loaders). Any batches
    left over after the last report are summarised once at the end.

    Parameters
    ----------
    epoch : int
        Zero-based epoch index; shown 1-based in progress lines and passed to
        ``save``.
    model : torch.nn.Module
    dataloader : iterable with ``len``
        Yields ``(smiles, iupac_in, iupac_out, smiles_lens, iupac_lens)``.
    optimizer : torch.optim.Optimizer
    sched : optional
        Learning-rate scheduler stepped once per batch when given.
    """
    model.train()
    start = time.time()
    print_every = max(1, int(len(dataloader) / 100.0))
    # Loop-invariant: index of the padding symbol, excluded from the loss.
    pad_index = ord(EXTRA_CHARS['pad'])

    running_loss = 0.0
    pending = 0        # batches accumulated since the last progress line
    batches_seen = 0   # stays 0 when the dataloader yields nothing

    for batches_seen, (smiles, iupac_in, iupac_out, _smiles_lens, _iupac_lens) in enumerate(dataloader, start=1):
        smiles = smiles.to(DEVICE)
        iupac_in = iupac_in.to(DEVICE)
        iupac_out = iupac_out.to(DEVICE)

        optimizer.zero_grad()

        smiles_mask, iupac_mask = create_masks(smiles, iupac_in, device=DEVICE)
        preds = model(smiles, iupac_in, smiles_mask, iupac_mask)

        # Flatten to (tokens, classes) vs (tokens,) for the token-level loss.
        loss = torch.nn.functional.cross_entropy(
            preds.view(-1, preds.size(-1)),
            iupac_out.view(-1),
            ignore_index=pad_index,
        )
        loss.backward()
        optimizer.step()
        if sched is not None:
            sched.step()

        running_loss += loss.item()
        pending += 1

        if pending == print_every:
            print_progress((time.time() - start)//60, epoch+1, batches_seen, running_loss / pending)
            running_loss = 0.0
            pending = 0

    # Report the trailing batches only when there are some; otherwise the
    # last line would show a misleading loss of 0 (or fail on an empty loader).
    if pending:
        print_progress((time.time() - start)//60, epoch+1, batches_seen, running_loss / pending)
    save(epoch, model, optimizer)
    
    
# Build the training data pipeline from the CLI settings.
# NOTE(review): this rebinds the notebook-level name `dataset`, which an
# earlier cell assigned to the ogbn-arxiv dataset.
dataloader, dataset = get_dataloader(args.batch_size, args.data_path, max_len=args.max_length)

print("Loaded {0} samples from {1}".format(len(dataset), args.data_path))

# Create the model; wrap it in DataParallel only when CUDA is available and
# --cpu was not requested, then move it to the selected device.
print("Initializing Transformer...")
model = Transformer(ALPHABET_SIZE, args.embedding_size, args.num_layers)
if torch.cuda.is_available() and not args.cpu:
    model = torch.nn.DataParallel(model)
model = model.to(DEVICE)
print("Transformer Initialized on device(s):", DEVICE)

# The scheduler is stepped once per batch in train_epoch, so T_max equal to
# the number of batches makes the first cosine cycle last one epoch.
optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=(0.9, 0.98), eps=1e-9)
sched = CosineWithRestarts(optimizer, T_max=len(dataloader))
# Zero-based index of the first epoch to run; overwritten when resuming.
epoch = 0

# Optionally resume from a checkpoint written by save(): restore the model and
# optimizer state and continue with the epoch after the stored one.
if args.checkpoint_path is not None:
    print("Loading pretrained weights from", args.checkpoint_path)
    # map_location remaps the stored tensors onto the active device, so a
    # checkpoint written on a GPU can still be resumed on a CPU-only run.
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    checkpoint = torch.load(args.checkpoint_path, map_location=DEVICE)

    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # Sanity check: the restored optimizer must carry the stored learning rate.
    assert optimizer.param_groups[0]['lr'] == checkpoint['lr']
    epoch = checkpoint['epoch'] + 1
    print("Pretrained weights loaded. Resuming training at epoch", epoch)

# Run args.num_epochs epochs starting at the (possibly resumed) epoch index;
# the printed epoch number is 1-based.
for i in range(epoch, epoch + args.num_epochs):
    print("Starting epoch", i+1)
    train_epoch(i, model, dataloader, optimizer, sched)