In [None]:
# Runtime setup: install the Hugging Face `transformers` package that the cells below import.
!pip install transformers

In [None]:
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup

# Choose the Hugging Face checkpoint to fine-tune. Leave exactly one
# `model_name` assignment uncommented; the tokenizer and the classifier are
# both loaded from this identifier.
# model_name = 'bert-base-uncased'
model_name = 'allenai/scibert_scivocab_uncased'
# model_name = 'google/electra-base-discriminator'
# model_name = 'allenai/longformer-base-4096'
import torch
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

import pandas as pd

import re
import time
import random

from tqdm import tqdm

# Reproducibility: one seed is fed to Python's `random` and to torch
# (CPU generator and every CUDA device).
seed_val = 42
for _seed_rng in (random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_rng(seed_val)

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Google Drive folder that holds the CAMDA data files.
PATH = '/content/drive/MyDrive/GRN/CAMDA/'

In [None]:
# Mount Google Drive at /content/drive so the files under PATH
# ('/content/drive/MyDrive/...') become readable. Requires the `google.colab`
# module, i.e. this cell is meant to run inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

# CAMDA challenge data


In [None]:
# Load the labelled DILI corpus and shuffle its rows.
# `random_state` makes the shuffle reproducible: pandas samples with NumPy's
# RNG, which the seeding cell (random / torch only) does not seed.
data_raw = pd.read_csv(PATH + 'DILI_data.csv')
data = data_raw.sample(frac=1, random_state=seed_val)

# Replace missing titles/abstracts with empty strings so that `map(str)`
# below does not turn NaN into the literal text "nan".
data['Abstract'] = data['Abstract'].fillna("")
data['Title'] = data['Title'].fillna("")

# One document per row: "<title>. <abstract>".
data['Documents'] = data['Title'].map(str) + '. ' + data['Abstract'].map(str)

# Strip square brackets and upper-case prefixes followed by ": ".
# The pattern is unchanged; it is now a raw string because "\:" is an invalid
# escape sequence in a normal string literal and triggers a warning.
docs = [re.sub(r'[A-Z]+\: |\[|\]', '', doc) for doc in data['Documents'].to_list()]
labels = data['Label'].to_list()

In [None]:
# Fetch the (fast) tokenizer that belongs to the selected checkpoint.
print('Loading tokenizer...')
tokenizer = AutoTokenizer.from_pretrained(model_name,  use_fast=True)

# Token count of every document, special tokens included.
doc_token_counts = [
    len(tokenizer.encode(doc, add_special_tokens=True)) for doc in docs
]

# Longest document, and how many documents exceed 512 tokens (the
# `max_length` used when the corpus is encoded for training).
max_len = max(doc_token_counts, default=0)
counter = sum(1 for n_tokens in doc_token_counts if n_tokens > 512)

print('Amount of documents with more then 512 tokens - ', counter)
print('Maximum length of document - ', max_len)

Loading tokenizer...


Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/223k [00:00<?, ?B/s]

Amount of documents with more then 512 tokens -  35
Maximum length of document -  1639


# Data preparation


In [None]:
# Tokenize every document to a fixed length of 512 tokens (shorter documents
# are padded, longer ones truncated) and return PyTorch tensors.
encoded_dict = tokenizer(
    docs,
    add_special_tokens = True,       # let the tokenizer add its special tokens
    max_length = 512,
    padding = 'max_length',
    truncation = True,
    return_attention_mask = True,
    return_tensors = 'pt',
)

# Convert the labels to a tensor. `as_tensor` keeps this cell re-runnable:
# on a second run `labels` is already a tensor, and `torch.tensor(tensor)`
# would emit a copy-construct warning.
labels = torch.as_tensor(labels)
assert len(labels) == encoded_dict['input_ids'].shape[0], \
    "number of labels does not match number of encoded documents"

# One sample = (input ids, attention mask, label).
dataset = TensorDataset(encoded_dict['input_ids'], encoded_dict['attention_mask'], labels)

# 90/10 train/validation split.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# A dedicated, seeded generator makes the split independent of the global
# RNG state, so re-running this cell (e.g. after training) yields the same
# partition instead of silently moving training samples into validation.
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator = torch.Generator().manual_seed(seed_val),
)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

# Batch size shared by both loaders. The usual fine-tuning recommendation is
# 16 or 32; 3 is presumably chosen so that 512-token batches fit in GPU
# memory -- TODO confirm.
batch_size = 3

# Training batches are drawn in random order.
train_dataloader = DataLoader(
    train_dataset,
    sampler = RandomSampler(train_dataset),
    batch_size = batch_size,
)

# Order is irrelevant for validation, so read it sequentially.
validation_dataloader = DataLoader(
    val_dataset,
    sampler = SequentialSampler(val_dataset),
    batch_size = batch_size,
)

5,066 training samples
  563 validation samples


# Model preparation

In [None]:
# Number of distinct classes in the label set.
# By the time this cell runs, `labels` is a torch tensor. Iterating a tensor
# yields 0-d tensors that hash by identity, so `len(set(labels))` returned the
# number of *samples* rather than the number of classes. Converting to plain
# Python ints first gives the real class count (works for a list or a tensor).
num_labels = len(set(torch.as_tensor(labels).tolist()))

# Load the pretrained checkpoint with a sequence-classification head on top.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,                     # checkpoint selected in the setup cell
    num_labels = num_labels,        # size of the classification output
    output_attentions = False,      # do not return attention weights
    output_hidden_states = False,   # do not return all hidden states
)

# Move the model to the selected device (GPU when available).
model.to(device)

# All named parameters as a list of (name, tensor) tuples.
params = list(model.named_parameters())

print('The Transformer model has {:} different named parameters.\n'.format(len(params)))

# NOTE(review): the slice boundaries below assume a BERT-style parameter
# layout (5 embedding tensors, 16 per encoder layer, 4 at the output);
# the section titles may not match for other architectures.
print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))


Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

The Transformer model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (31090, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias        

# Training


In [None]:
# Optimizer: PyTorch's own AdamW. The `transformers.AdamW` class used here
# before is deprecated in favour of `torch.optim.AdamW`.
# `weight_decay = 0.0` reproduces the default of the old class (the torch
# default is 0.01), so the optimisation itself is unchanged.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr = 2e-5,           # learning rate
                              eps = 1e-8,          # Adam epsilon
                              weight_decay = 0.0)

# Number of training epochs.
epochs = 3

# Total number of optimizer steps = [number of batches] x [number of epochs].
total_steps = len(train_dataloader) * epochs

# Linear learning-rate decay to zero over `total_steps`, without warm-up.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

# One dict of statistics per epoch.
training_stats = []

# Wall-clock start of the whole run.
total_t0 = time.time()

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print()
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    # Sum of the per-batch training losses of this epoch.
    total_train_loss = 0

    model.train()

    for batch in tqdm(train_dataloader):

        # `batch` holds three tensors: [0] input ids, [1] attention masks, [2] labels.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # Forward pass; because labels are passed, the first output is the loss.
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        total_train_loss += loss.item()

        # Backward pass.
        loss.backward()

        # Clip the gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Parameter update, then learning-rate update.
        optimizer.step()
        scheduler.step()

    # Mean training loss per batch.
    avg_train_loss = total_train_loss / len(train_dataloader)

    training_time = time.time() - t0

    print()
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================

    print()
    print("Running Validation...")

    t0 = time.time()

    # Evaluation mode: dropout layers behave differently than during training.
    model.eval()

    # Counters are plain Python numbers so the derived metrics are floats.
    total_eval_correct = 0            # predictions equal to the label
    total_eval_samples = 0            # samples seen
    total_eval_true_positives = 0     # predicted positive and labelled positive
    total_eval_result_positives = 0   # predicted positive
    total_eval_positives = 0          # labelled positive
    total_eval_loss = 0

    for batch in tqdm(validation_dataloader):

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs[0]
            logits = outputs[1]

        total_eval_loss += loss.item()

        # Move logits and labels to the CPU for the metric bookkeeping.
        logits = logits.detach().cpu()
        label_ids = b_labels.to('cpu')

        # Predicted class = index of the largest logit.
        results = torch.argmax(logits, dim=-1)

        # Count per sample instead of averaging per-batch means: the last batch
        # can be smaller, which would otherwise be over-weighted.
        total_eval_correct += (results == label_ids).sum().item()
        total_eval_samples += label_ids.numel()
        total_eval_true_positives += torch.logical_and(results, label_ids).sum().item()
        total_eval_result_positives += (results > 0).sum().item()
        total_eval_positives += (label_ids > 0).sum().item()

    # Final metrics for this epoch; an empty denominator yields 0.0 instead of NaN.
    avg_val_accuracy = (total_eval_correct / total_eval_samples
                        if total_eval_samples else 0.0)
    avg_val_precision = (total_eval_true_positives / total_eval_result_positives
                         if total_eval_result_positives else 0.0)
    avg_val_recall = (total_eval_true_positives / total_eval_positives
                      if total_eval_positives else 0.0)
    print("  Accuracy: {0:.5f}".format(avg_val_accuracy))
    print("  Precision: {0:.5f}".format(avg_val_precision))
    print("  Recall: {0:.5f}".format(avg_val_recall))

    # Mean validation loss per batch.
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = time.time() - t0

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record the statistics of this epoch (keys kept exactly as before).
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Valid. Precision': avg_val_precision,
            'Vald. Recall': avg_val_recall,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print()
print("Training complete!")

print("Total training took {:} s".format(time.time()-total_t0))


# Validation

In [None]:
# Score the validation split once more and compute the metrics over all
# samples at once.

# Inference only: make sure dropout is disabled regardless of which cell ran last.
model.eval()

results = []
val_labels = []
for batch in validation_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    val_labels.append(batch[2])   # labels stay on the CPU
    with torch.no_grad():
        # No labels are passed, so the first output is the logits.
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs[0]
    results.append(torch.argmax(logits, dim=-1).cpu())

results = torch.cat(results, dim=0)
val_labels = torch.cat(val_labels, dim=0)

# Confusion counts as plain ints (labels are treated as binary 0/1).
true_positives = torch.logical_and(results, val_labels).sum().item()
false_positives = ((results - val_labels) == 1).sum().item()    # predicted 1, label 0
false_negatives = ((results - val_labels) == -1).sum().item()   # predicted 0, label 1
predicted_positives = results.sum().item()
actual_positives = val_labels.sum().item()
actual_negatives = (val_labels == 0).sum().item()

total_eval_accuracy = (results == val_labels).float().mean().item()
# An empty denominator yields 0.0 instead of NaN.
total_eval_precision = true_positives / predicted_positives if predicted_positives else 0.0
total_eval_recall = true_positives / actual_positives if actual_positives else 0.0
# False-positive rate is relative to the actual negatives and false-negative
# rate to the actual positives (the two denominators used to be swapped).
FP_rate = false_positives / actual_negatives if actual_negatives else 0.0
FN_rate = false_negatives / actual_positives if actual_positives else 0.0

print("  Accuracy: {0:.5f}".format(total_eval_accuracy))
print("  Precision: {0:.5f}".format(total_eval_precision))
print("  Recall: {0:.5f}".format(total_eval_recall))
print("  FP rate: {0:.5f}".format(FP_rate))
print("  FN rate: {0:.5f}".format(FN_rate))

# Testing

In [None]:
# Checkpoint to evaluate; leave exactly one path uncommented.
# model_for_val = '/content/drive/MyDrive/GRN/CAMDA/electra_dili.pt'
model_for_val = '/content/drive/MyDrive/GRN/CAMDA/longformer_dili.pt'

# SECURITY NOTE: `torch.load` unpickles the file, which can execute arbitrary
# code -- only load checkpoints you created or trust.
# NOTE(review): on PyTorch releases where `torch.load` defaults to
# `weights_only=True`, a fully pickled model additionally needs
# `weights_only=False` -- confirm against the runtime's torch version.
with open(model_for_val, 'rb') as f:
    # `map_location` lets a checkpoint saved on a GPU load on a CPU-only runtime.
    model = torch.load(f, map_location=device)

model = model.to(device)
# The following cells only run inference, so disable dropout.
model.eval()

In [None]:
# Tokenizer used for the test-time encoding. `model_name` is re-assigned here;
# presumably it has to name the same architecture as the checkpoint selected
# in `model_for_val` -- keep the two choices in sync.
model_name = 'allenai/longformer-base-4096'
# model_name = 'google/electra-base-discriminator'

tokenizer = AutoTokenizer.from_pretrained(model_name,  use_fast=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=694.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…




In [None]:
# NOTE: with the block below commented out, `docs` (and `data`) are whatever
# an earlier cell left in the kernel. Uncomment it to score another file.
# data = pd.read_csv(PATH + 'DILI_val.tsv', sep = '\t')
# # data_raw = pd.read_csv(PATH + 'DILI_data.csv')
# # data = data_raw.sample(frac = 1)

# data['Documents'] =  data['Abstract'].map(str)  #data['Title'].map(str) + '. ' +
# docs = [re.sub('[A-Z]+\: |\[|\]', '', doc) for doc in data['Documents'].to_list()]

# Tokenize every document to a fixed length of 512 tokens (padding shorter
# documents, truncating longer ones) and return PyTorch tensors.
encoded_dict = tokenizer(
    docs,
    add_special_tokens = True,
    max_length = 512,
    padding = 'max_length',
    truncation = True,
    return_attention_mask = True,
    return_tensors = 'pt',
)

# Unlabelled dataset: (input ids, attention mask).
test_dataset = TensorDataset(encoded_dict['input_ids'], encoded_dict['attention_mask'])

# Sequential order so that predictions line up with `docs`.
test_dataloader = DataLoader(
    test_dataset,
    sampler = SequentialSampler(test_dataset),
    batch_size = 8,
)

# Inference only: disable dropout and gradient tracking.
model.eval()

results = []
with torch.no_grad():
    for batch in tqdm(test_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)

        # No labels are passed, so the first output is the logits.
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs[0]

        # Predicted class per document, moved to the CPU once.
        results.append(torch.argmax(logits, dim=-1).cpu())

# One 1-D tensor with a prediction per document.
results = torch.cat(results, dim=0)

100%|██████████| 620/620 [01:34<00:00,  6.58it/s]


In [None]:
# Compare the predictions in `results` with the labels stored in `data`.
val_labels = torch.tensor(data['Label'].to_list())
# `results` is already a tensor; `as_tensor` returns it unchanged, whereas
# `torch.tensor(tensor)` emits a copy-construct warning.
results = torch.as_tensor(results)
assert results.shape == val_labels.shape, \
    "predictions and labels differ in length -- were they built from the same data?"

# Confusion counts as plain ints (labels are treated as binary 0/1).
true_positives = torch.logical_and(results, val_labels).sum().item()
predicted_positives = results.sum().item()
actual_positives = val_labels.sum().item()

total_eval_accuracy = (results == val_labels).float().mean().item()
# An empty denominator yields 0.0 instead of NaN.
total_eval_precision = true_positives / predicted_positives if predicted_positives else 0.0
total_eval_recall = true_positives / actual_positives if actual_positives else 0.0
print("  Accuracy: {0:.5f}".format(total_eval_accuracy))
print("  Precision: {0:.5f}".format(total_eval_precision))
print("  Recall: {0:.5f}".format(total_eval_recall))

  Accuracy: 0.99085
  Precision: 0.98893
  Recall: 0.99306


  


In [None]:
# Agreement between the current predictions and a previously written
# submission file (fraction of identical labels).
comp_data = pd.read_csv('/content/igor.stepanov2000_at_gmail_com__test_submission_l.csv')
comp_labels = torch.tensor(comp_data['Labels'].to_list())
# `results` is already a tensor; `as_tensor` avoids the copy-construct
# warning that `torch.tensor(tensor)` emits.
results = torch.as_tensor(results)
(results == comp_labels).float().mean()

  This is separate from the ipykernel package so we can avoid doing imports until


tensor(0.9725)

In [None]:
# Attach the predictions to the data frame and write the submission file.
# Converting to a plain list of Python ints avoids relying on pandas'
# implicit handling of a torch tensor as column data.
data['Labels'] = torch.as_tensor(results).tolist()
data.to_csv('igor.stepanov2000_at_gmail_com__test_submission_e.csv')