In [1]:
import numpy as np
from tqdm import tqdm, trange
from transformers import BertTokenizer, BertConfig
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertForTokenClassification, AdamW
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
from torch.utils.data import TensorDataset, RandomSampler, SequentialSampler, DataLoader
from transformers import get_linear_schedule_with_warmup
from seqeval.metrics import accuracy_score, classification_report

Collecting transformers
  Downloading transformers-4.11.2-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 5.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 51.8 MB/s 
Collecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.17-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 1.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 45.2 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 36.0 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
  

In [2]:
path = '/content/drive/MyDrive/Oscer/data/NERdata/train.tsv'
test = "/content/drive/MyDrive/Oscer/data/NERdata/test.tsv"

In [3]:
def dataToLabels(filename):
  datafile = open(filename)
  datanLabels = []
  sentence = []
  tags = []
  for line in datafile:
    if len(line)==0 or line[0]=="\n":
       if len(sentence) > 0:
         datanLabels.append(sentence)
         sentence = []
       continue
    splits = line.split('\t')
    sentence.append([splits[0],splits[-1].rstrip("\n")])
        
  if len(sentence) > 0:
     datanLabels.append(sentence)
     sentence = []
  return datanLabels

In [4]:
data = dataToLabels(path)

In [5]:
testData = dataToLabels(path)

In [6]:
def extractWordsNTags(data):
  sentences = [[word[0] for word in sentence] for sentence in data]
  labels = [[word[1] for word in sentence] for sentence in data]
  return sentences, labels

In [7]:
sentences, labels = extractWordsNTags(data)
Tags = list(set(labels[0]))
Tags.append('PAD')
tag2idx = {t: i for i, t in enumerate(Tags)}

In [24]:
Tags

['B', 'O', 'I', 'PAD']

In [9]:
model_path = '/content/drive/MyDrive/Oscer/SavedModel'

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
#tokenizer = AutoTokenizer.from_pretrained("fidukm34/biobert_v1.1_pubmed-finetuned-ner-finetuned-ner")
model = AutoModelForTokenClassification.from_pretrained(
    model_path,
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)
MAX_LEN = 75
batch_size = 32

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [11]:
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [12]:
tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(sentences, labels)
]

In [13]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

In [14]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")


In [15]:
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

In [16]:
train_inputs, valid_inputs, train_tags, valid_tags = train_test_split(input_ids, tags,
                                                            random_state=2021, test_size=0.1)
        
train_masks, valid_masks, _, _ = train_test_split (attention_masks, input_ids,
                                             random_state=2021, test_size=0.1)
                                             
train_inputs = torch.tensor(train_inputs)
valid_inputs = torch.tensor(valid_inputs)
train_tags = torch.tensor(train_tags)
valid_tags = torch.tensor(valid_tags)
train_masks = torch.tensor(train_masks)
valid_masks = torch.tensor(valid_masks)
        
train_data = TensorDataset(train_inputs, train_masks, train_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

valid_data = TensorDataset(valid_inputs, valid_masks, valid_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_size)

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [18]:
model.to(device)
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)


In [16]:
epochs = 2
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)


In [18]:
epochs = 1

In [19]:
## Store the average loss after each epoch so we can plot them.
loss_values, validation_loss_values = [], []

for epoch in trange(epochs, desc="Epoch"):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.

    # Put the model into training mode.
    model.train()
    # Reset the total loss for this epoch.
    total_loss = 0

    # Training loop
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        batch_input_ids, batch_input_mask, batch_labels = batch
        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        # forward pass
        # This will return the loss (rather than the model output)
        # because we have provided the `labels`.
        outputs = model(batch_input_ids, token_type_ids=None,
                        attention_mask=batch_input_mask, labels=batch_labels)
        # get the loss
        loss = outputs[0]
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # track train loss
        total_loss += loss.item()
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)
    


    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    # Put the model into evaluation mode
    model.eval()
    # Reset the validation loss for this epoch.
    eval_loss = 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        batch_input_ids, batch_input_mask, batch_labels = batch

        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            outputs = model(batch_input_ids, token_type_ids=None,
                            attention_mask = batch_input_mask, labels=batch_labels)
        # Move logits and labels to CPU
        logits = outputs[1].detach().cpu().numpy()
        label_ids = batch_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences.
        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))
    pred_tags = [Tags[p_i] for predicted, actual in zip(predictions, true_labels)
                                 for p_i, l_i in zip(predicted, actual) if Tags[l_i] != "PAD"]
    valid_tags = [Tags[l_i] for l in true_labels
                                  for l_i in l if Tags[l_i] != "PAD"]
    print("Validation Accuracy: {}\n".format(accuracy_score(pred_tags, valid_tags)))
  


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Average train loss: 0.34252139882966753


Epoch: 100%|██████████| 1/1 [02:04<00:00, 124.35s/it]

Validation loss: 0.023984065125285482
Validation Accuracy: 0.9952752444786287






In [22]:
model.save_pretrained('/content/drive/MyDrive/Oscer/SavedModel')

In [19]:
test = "/content/drive/MyDrive/Oscer/data/NERdata/test.tsv"

In [20]:
def testDatatoSentences(dataPath):
    fileContents = open(dataPath)
    sentenceCollection = []
    sentenceTags = []
    tagCollection = []
    flag = 1
    for line in fileContents:
      if flag == 1:
        firstSplit = line.split('\t')
        Word = firstSplit[0]
        tag = firstSplit[-1].rstrip('\n')
        sentenceTags.append(tag)
        flag = 0
        continue
      if line == '\n':
        sentenceCollection.append(Word)
        tagCollection.append(sentenceTags)
        sentenceTags = []
        flag = 1
        continue
      splitted = line.split('\t')
      subsqWord = splitted[0]
      tag = splitted[-1].rstrip('\n')
      Word = Word + " " + subsqWord
      sentenceTags.append(tag)
      
    return sentenceCollection, tagCollection


In [21]:
testSentences, testLabels = testDatatoSentences(test)

In [22]:
#test_sentence = "Identification of APC2, a homologue of the adenomatous polyposis coli tumour suppressor"
test_sentence = "Clustering of missense mutations in the ataxia-telangiectasia gene in a sporadic T-cell leukaemia."
#Ataxia-telangiectasia (A-T) is a recessive multi-system disorder caused by mutations in the ATM gene at 11q22-q23 (ref.3). The risk of cancer, especially lymphoid neoplasias, is substantially elevated in A-T patients and has long been associated with chromosomal instability"

In [23]:
tokenCollection = []
labelCollection = []
test_loss = 0
model.to(device)
for testSentence in testSentences:
  tokenized_sentence = tokenizer.encode(testSentence)
  input_ids = torch.tensor([tokenized_sentence]).to(device)
  with torch.no_grad():
     output = model(input_ids)
  label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)
  
  # join bpe split tokens
  tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])
  test_loss += output[0].mean().item()
  new_tokens, new_labels = [], []
  for token, label_idx in zip(tokens, label_indices[0]):
    if token == '[CLS]' or token == '[SEP]':
      continue
    if token.startswith("##"):
        new_tokens[-1] = new_tokens[-1] + token[2:]
    else:
        new_labels.append(Tags[label_idx])
        new_tokens.append(token)
  tokenCollection.append(new_tokens)
  labelCollection.append(new_labels)

print("Test Loss: {}".format(test_loss/len(testSentences)))
print("Test Accuracy: {}".format(accuracy_score(testLabels, labelCollection)))
print("Classification Report:\n {}".format(classification_report(testLabels, labelCollection)))


Test Loss: -0.16185209870417702
Test Accuracy: 0.9804874066212189
Classification Report:
               precision    recall  f1-score   support

           _       0.78      0.83      0.80       960

   micro avg       0.78      0.83      0.80       960
   macro avg       0.78      0.83      0.80       960
weighted avg       0.78      0.83      0.80       960



In [29]:
print("{}\n{}\n{}".format(testLabels[0], labelCollection[0], tokenCollection[0]))

['O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'O', 'O', 'O', 'B', 'I', 'I', 'I', 'I', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'I', 'O']
['Clustering', 'of', 'missense', 'mutations', 'in', 'the', 'ataxia', '-', 'telangiectasia', 'gene', 'in', 'a', 'sporadic', 'T', '-', 'cell', 'leukaemia', '.']


In [32]:
import csv

In [31]:
results = "/content/drive/MyDrive/Oscer/data/NERdata/results.tsv"

In [33]:
with open(results, 'wt') as out_file:
  tsv_writer = csv.writer(out_file, delimiter='\t')
  tsv_writer.writerow(['Model_Prediction', 'Tags', 'Tokens'])
  for predictionTags, labelTags, tokens in zip(labelCollection,  testLabels, tokenCollection):
    for predictionTag, labelTag, token in zip(predictionTags, labelTags, tokens):
      tsv_writer.writerow([predictionTag, labelTag, token])
    

In [30]:
for predictionTag, labelTag, token in zip(labelCollection[0],  testLabels[0], tokenCollection[0]):
  print("{}\t{}\t{}\n".format(predictionTag, labelTag, token))

O	O	Clustering

O	O	of

O	O	missense

O	O	mutations

O	O	in

O	O	the

B	B	ataxia

I	I	-

I	I	telangiectasia

O	O	gene

O	O	in

O	O	a

O	B	sporadic

B	I	T

I	I	-

I	I	cell

I	I	leukaemia

O	O	.



In [25]:
#!python3 /content/drive/MyDrive/Oscer/train.py /content/drive/MyDrive/Oscer/data/NERdata/train.tsv

Downloading: 100% 416M/416M [00:11<00:00, 37.7MB/s]
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not in