In [1]:
# Set a seed value
seed_value=42
# 1. Set `PYTHONHASHSEED` environment variable at a fixed value
import os
os.environ['PYTHONHASHSEED']=str(seed_value)
# 2. Set `python` built-in pseudo-random generator at a fixed value
import random
random.seed(seed_value)
# 3. Set `numpy` pseudo-random generator at a fixed value
import numpy as np
np.random.seed(seed_value)
# 4. Set `pytorch` pseudo-random generator at a fixed value
import torch
torch.manual_seed(seed_value)
torch.backends.cudnn.deterministic = True

In [2]:
# data_path = "/sbksvol/xiang/sbks_gitlab/top-model/BIOBERT/NER/data/raw/Cellline"
data_path = "/sbksvol/gaurav/NER_data/"
# saved_model_path = "/sbksvol/gaurav/biobert_v1.0_pubmed_pmc/"
# saved_model_path = "/sbksvol/gaurav/biobert_v1.1_pubmed/"
# pretrained_model_path = "/sbksvol/gaurav/biobert_v1.0_pubmed_pmc/biobert_model.ckpt"
# config_path = "/sbksvol/gaurav/biobert_v1.0_pubmed_pmc/config.json"
# config_path = "/sbksvol/gaurav/biobert_v1.1_pubmed/config.json"
# vocab_path = "/sbksvol/gaurav/biobert_v1.1_pubmed/vocab.txt"

In [3]:
# from convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch

In [4]:
ENT = "Cellline"
DATASET = "jnlpba"

In [5]:
import os
data_dir = os.path.join(data_path, ENT, DATASET)

In [6]:
# !pip install --upgrade pip
# !pip install transformers
# !pip install seqeval

## Prepare Data

In [7]:
import json

files = ["train", "dev", "test"]


def convert(lines, f):
    tokens_ = []
    tags_ = []

    data = {"words": [], "ner": []}

    for line in lines:
        line = line.strip()
        if len(line) == 0:
            data["words"].append(tokens_)
            data["ner"].append(tags_)
            tokens_ = []
            tags_ = []
        else:
            token, tag = line.split("\t")
            if len(tag) > 1:
                tag = tag.split("-")[0]
            tokens_.append(token.strip())
            tags_.append(tag.strip())
            
    if len(tokens_) > 0:
        data["words"].append(tokens_)
        data["ner"].append(tags_)

    return data

In [8]:
def writer(data, fp, add_str=""):

    for (tokens, tags) in zip(data["words"], data["ner"]):
        for (token, tag) in zip(tokens, tags):
            if tag == "B" or tag == "I":
                tag += add_str
            fp.write("{}\t{}\n".format(token, tag))
        fp.write("\n")

In [9]:
## convert all tsv files to txt

all_data = {}
for f in files:
    with open(os.path.join(data_dir, f + ".tsv"), "r") as fp:
        lines = fp.readlines()
        all_data[f] = convert(lines, fp)
    fp = open(os.path.join(data_dir, f + ".txt"), "w")
    writer(all_data[f], fp)
    fp.close()

In [10]:
num_train_sents = len(all_data["train"]["words"])
num_dev_sents = len(all_data["dev"]["words"])
num_test_sents = len(all_data["test"]["words"])
print(num_train_sents, num_dev_sents, num_test_sents)

11167 1834 5548


In [11]:
import pandas as pd

# add the index to keep track of sentences
train_tuples = []
for i,(tokens,tags) in enumerate(zip(all_data["train"]["words"],all_data["train"]["ner"])):
    for token,tag in zip(tokens,tags):
        train_tuples.append([i,token,tag])

test_tuples = []
for i,(tokens,tags) in enumerate(zip(all_data["test"]["words"],all_data["test"]["ner"])):
    for token,tag in zip(tokens,tags):
        test_tuples.append([i,token,tag])
    
train_df = pd.DataFrame(train_tuples, columns=['sentence_id', 'words', 'labels'])
test_df = pd.DataFrame(test_tuples, columns=['sentence_id', 'words', 'labels'])

In [12]:
print(test_df.head(10))

   sentence_id              words labels
0            0         Reactivity      O
1            0                 of      O
2            0        lymphocytes      B
3            0                 to      O
4            0                  a      O
5            0       progesterone      O
6            0  receptor-specific      O
7            0         monoclonal      O
8            0           antibody      O
9            0                  .      O


In [13]:
import numpy as np

In [14]:
# a list that has all possible labels 
labels = np.sort(train_df['labels'].unique()).tolist()
label_map =  {i: label for i, label in enumerate(labels)}
num_labels = len(labels)
print(labels)

['B', 'I', 'O']


## Model Definition

In [15]:
# tf_checkpoint_path = "/sbksvol/gaurav/biobert_v1.0_pubmed_pmc/model.ckpt.index"
# bert_config_file = config_path
# pytorch_dump_path = "/sbksvol/gaurav/biobert_v1.0_pubmed_pmc_pytorch/pytorch_model.bin"

# convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path)

In [16]:
# saved_model_path = "/sbksvol/gaurav/biobert_v1.0_pubmed_pmc_pytorch/"
# config_path = "/sbksvol/gaurav/biobert_v1.0_pubmed_pmc_pytorch/config.json"

In [17]:
model_args = dict()

# Path to pretrained model or model identifier from huggingface.co/models
model_args['model_name_or_path'] = 'dmis-lab/biobert-base-cased-v1.1'
# saved_model_path
# saved_model_path
# pytorch_dump_path
# 'dmis-lab/biobert-base-cased-v1.1'

# Where do you want to store the pretrained models downloaded from s3
model_args['cache_dir'] = "/sbksvol/gaurav/NER_out/"

# we skip basic white-space tokenization by passing do_basic_tokenize = False to the tokenizer
model_args['do_basic_tokenize'] = False


data_args = dict()

data_args['data_dir'] = data_dir

# "The maximum total input sequence length after tokenization. Sequences longer "
# "than this will be truncated, sequences shorter will be padded."
data_args['max_seq_length'] = 256

# Overwrite the cached training and evaluation sets
# this means the model does not have to tokenize/preprocess and cache the data each time it's called
# this can be made different for each NerDataset (training NerDataset, testing NerDataset)
data_args['overwrite_cache'] = True

In [18]:
import transformers
# code has been tested with transformers 4.4.2
transformers.__version__

'4.4.2'

In [19]:
import torch
# device = torch.device('cpu')

In [20]:
from transformers import (
    BertConfig,
    BertForTokenClassification,
    BertTokenizer
)

config = BertConfig.from_pretrained(
    model_args['model_name_or_path'],
    num_labels=num_labels,
    id2label=label_map,
    label2id={label: i for i, label in enumerate(labels)},
    cache_dir=model_args['cache_dir']
)

# we skip basic white-space tokenization by passing do_basic_tokenize = False to the tokenizer
tokenizer = BertTokenizer.from_pretrained(
    model_args['model_name_or_path'],
    cache_dir=model_args['cache_dir']
#     ,do_basic_tokenize = model_args['do_basic_tokenize']
)

model = BertForTokenClassification.from_pretrained(
    model_args['model_name_or_path'],
    config=config
    ,cache_dir=model_args['cache_dir']
#     ,from_tf=True
)

Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.1 were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initi

In [21]:
model.train()
# model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

## Create Dataset Objects

In [22]:
data_utils_path = "/sbksvol/gaurav/transformers/examples/token-classification/"

In [23]:
import sys
if data_utils_path not in sys.path:
    sys.path.append(data_utils_path)

In [24]:
from utils_ner import NerDataset, Split
# %reset_selective -f "utils_ner"
# NerDataset.__init__

In [25]:
train_dataset = NerDataset(
  data_dir=data_args['data_dir'],
  tokenizer=tokenizer,
  labels=labels,
  model_type=config.model_type,
  max_seq_length=data_args['max_seq_length'],
  overwrite_cache=data_args['overwrite_cache'], # True
  mode=Split.train)

In [26]:
eval_dataset = NerDataset(
  data_dir=data_args['data_dir'],
  tokenizer=tokenizer,
  labels=labels,
  model_type=config.model_type,
  max_seq_length=data_args['max_seq_length'],
  overwrite_cache=data_args['overwrite_cache'],
  mode=Split.dev)

In [27]:
print(train_dataset.__len__(), eval_dataset.__len__())

11166 1833


## Create Trainer

In [28]:
from transformers.hf_argparser import HfArgumentParser
from transformers import TrainingArguments
from transformers import Trainer

In [29]:
import json

training_args_dict = {
    'output_dir' : "model_output1/",
    'num_train_epochs' : 10,
    'train_batch_size': 32,
    'seed':seed_value,
#     'save_strategy': "epoch",
    "evaluation_strategy": "epoch"
    ,"load_best_model_at_end": True
}

with open('training_args.json', 'w') as fp:
    json.dump(training_args_dict, fp)
    
parser = HfArgumentParser(TrainingArguments)
# this function returns a tuple so we get the first item in the tuple since we only passed one arguement type "TrainingArguments"
training_args = parser.parse_json_file(json_file="training_args.json")[0]

In [30]:
# param_list_orig = []
# for param in model.parameters():
#     param_list_orig.append(param)

In [31]:
# len(param_list_orig)

In [32]:
# import pickle

# with open(os.path.join("model_output1", "param_list_orig.pickle"), "wb") as fp:
#     pickle.dump(param_list_orig, fp)

## Train

In [33]:
# Initialize our Trainer
trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=train_dataset,
  eval_dataset=eval_dataset
)

trainOutput = trainer.train()


# torch.save(model, os.path.join(training_args_dict["output_dir"], "pytorch_model.bin"))
# trainer.save_model()
# model.save_pretrained(training_args_dict["output_dir"])

Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,0.0297,0.035633,10.7645,170.282
2,0.0195,0.026418,11.0569,165.779
3,0.0108,0.042056,11.0578,165.765
4,0.0067,0.0441,11.0695,165.59
5,0.0036,0.046253,10.9864,166.842
6,0.0025,0.059593,11.0546,165.814
7,0.0014,0.064529,10.9828,166.897
8,0.0008,0.071791,11.0167,166.383
9,0.0007,0.071113,10.9271,167.748
10,0.0002,0.073964,10.9924,166.752


In [34]:
trainOutput.global_step

13960

In [35]:
list(model.parameters())[-1].data

tensor([-0.0027, -0.0008,  0.0016], device='cuda:0')

In [36]:
# #### Training args ####
# training_args_dict = {
#     'output_dir' : "model_output1",
#     'num_train_epochs' : 10,
#     'train_batch_size': 32,
#     'seed':seed_value
# }

# with open('training_args.json', 'w') as fp:
#     json.dump(training_args_dict, fp)
    
# parser = HfArgumentParser(TrainingArguments)
# # this function returns a tuple so we get the first item in the tuple since we only passed one arguement type "TrainingArguments"
# training_args = parser.parse_json_file(json_file="training_args.json")[0]


# saved_path = "model_output1/checkpoint-150"
# #### Config ####
# config = BertConfig.from_pretrained(
#     saved_path,
#     num_labels=num_labels,
#     id2label=label_map,
#     label2id={label: i for i, label in enumerate(labels)},
#     cache_dir=model_args['cache_dir']
# )

# #### Model ####

# reloaded_model = BertForTokenClassification.from_pretrained(
#     saved_path,
#     config=config,
#     cache_dir=model_args['cache_dir']
# )

# # model = torch.load(os.path.join(training_args_dict["output_dir"], "pytorch_model.bin"))

In [37]:
# reloaded_model.to('cuda')
# reloaded_model.train()

In [38]:
# list(reloaded_model.parameters())[-1].data

In [39]:
# trainer = Trainer(
#   model=reloaded_model,
#   args=training_args,
#   train_dataset=train_dataset,
#   eval_dataset=eval_dataset
# )

# trainer.train("checkpoint-150")

In [40]:
# list(reloaded_model.parameters())[-1].data

In [41]:
# param_list_new = []
# for param in model.parameters():
#     param_list_new.append(param)

In [42]:
# with open(os.path.join("model_output1", "param_list_orig.pickle"), "rb") as fp:
#     param_list_orig = pickle.load(fp)

In [43]:
# for param_orig, param_new in zip(param_list_orig, param_list_new):
#     assert param_orig.shape == param_new.shape
#     print(torch.norm(param_orig.data - param_new.to('cpu').data))

In [44]:
# model = reloaded_model

## Clean-up

In [45]:
# torch.cuda.empty_cache()
# print(torch.cuda.memory_summary(device=None, abbreviated=False))
import gc
# del variables
gc.collect()

140

## Prepare test data

In [46]:
import numpy as np
from torch import nn

In [47]:
model.eval()
# we can pass overwrite_cache as True since we might like to make new predictions by just changing test.txt 
test_dataset = NerDataset(
  data_dir=data_args['data_dir'],
  tokenizer=tokenizer,
  labels=labels,
  model_type=config.model_type,
  max_seq_length=data_args['max_seq_length'],
  overwrite_cache=True,
  mode=Split.test)

# last layer output/activation has the shape of (batch_size, seq_len,num_of_labels)
output, label_ids, metrics = trainer.predict(test_dataset)
preds = np.argmax(output, axis=2)
num_test_samples, seq_len = preds.shape

In [48]:
print(test_dataset.__len__())

5547


In [49]:
# list of token-level predictions shape = (num_test_samples, seq_len)
preds_list = [[] for _ in range(num_test_samples)]
for i in range(num_test_samples):
#     if i == 184:
#         count = 0
    for j in range(seq_len):
#         if i == 184 and label_ids[i, j] != nn.CrossEntropyLoss().ignore_index:
#             count += 1
#             print(j, end=' ')
        # ignore pad_tokens
        if label_ids[i, j] != nn.CrossEntropyLoss().ignore_index:
            preds_list[i].append(label_map[preds[i][j]])

In [50]:
def sentences_combiner(df):
    # 'words' and 'labels' are the column names in the CSV file
    tupple_function = lambda x: [(w, t) for w, t in zip(x["words"].values.tolist(),
                                                      x["labels"].values.tolist())]
    grouped = df.groupby("sentence_id").apply(tupple_function)
    return [s for s in grouped]

testing_sentences = sentences_combiner(test_df)
test_labels = [[w[1] for w in s] for s in testing_sentences]
test_tokens = [[w[0] for w in s] for s in testing_sentences]

# reconstruct full sentences from lists of (token,label) tuples
# test_reconstructed = [" ".join([w[0] for w in s] ) for s in testing_sentences]

In [51]:
# make sure all test and pred sentences have the same length

test_labels_new = []
preds_list_new = []

for i, x in enumerate(test_labels):
    if len(x) == len(preds_list[i]):
        test_labels_new.append(x)
        preds_list_new.append(preds_list[i])
    else:
        print("ABORT ", i)
#         print(test_df.loc[test_df['sentence_id'] == i])
#         assert list(test_df.loc[test_df['sentence_id'] == i]['labels']) == x

ABORT  2406


In [52]:
# print(list(test_df.loc[test_df['sentence_id'] == 184]['words']))

In [53]:
def token_level(tg):
    gt = [np.array(x) == tg for x in test_labels]
    pred = [np.array(y) == tg for y in preds_list]
    
    total = sum([np.sum(x) for x in gt])
    
    correct = sum([np.sum(x&y) for x, y in zip(gt, pred)])
    
    print("correct = {}, total = {}".format(correct, total))
    
    acc = correct/total
    
    return acc

In [54]:
# token_level("B")

In [55]:
# !pip install seqeval

## Get entity level scores

In [56]:
from seqeval.metrics import f1_score, classification_report
print("F1-score: {:.1%}".format(f1_score(test_labels_new, preds_list_new)))
print(classification_report(test_labels_new, preds_list_new))

F1-score: 66.3%
              precision    recall  f1-score   support

           _       0.63      0.70      0.66      1117

   micro avg       0.63      0.70      0.66      1117
   macro avg       0.63      0.70      0.66      1117
weighted avg       0.63      0.70      0.66      1117



## Write Predictions to file

In [57]:
# # write the predictions to a file
# fp = open(os.path.join(data_dir, "pred.tsv"), "w")
# writer({"words": test_tokens, "ner": preds_list}, fp, "-NP")
# fp.close()

In [58]:
# # re-write the test file so that B and I tags have "-NP" attached
# fp = open(os.path.join(data_dir, "test_withnp.tsv"), "w")
# writer({"words": test_tokens, "ner": test_labels}, fp, "-NP")
# fp.close()

## Get token-level scores per tag type

In [59]:
# token_level("B")

In [60]:
# token_level("I")

In [61]:
# token_level("O")

## Copy test_withnp.tsv and pred.tsv to local machine

In [62]:
# kubectl cp (pod-name) \
# path_str1 = "/sbksvol/gaurav/NER_Medium_data/{0}/{1}/pred.tsv /Users/apple/Desktop/test/tmp2/conll2brat/data/huggingface/{0}/pred/{1}.conll.test".format(ENT, DATASET)

In [63]:
# print(path_str1)

In [64]:
# kubectl cp (pod-name) \
# path_str2 = "/sbksvol/gaurav/NER_Medium_data/{0}/{1}/test_withnp.tsv /Users/apple/Desktop/test/tmp2/conll2brat/data/huggingface/{0}/true/{1}.conll.test".format(ENT, DATASET)

In [65]:
# print(path_str2)

## Top-Models

In [66]:
# from transformers import BertModel
# import torch.nn as nn
# import torch.nn.functional as F
# from torch.utils.data import DataLoader

In [67]:
# import torch.nn.init
# from torch.autograd import Variable
# from torch.nn.parameter import Parameter
# import torch.optim as optim

In [68]:
# # base class for your deep neural networks. It implements the training loop (train_net).
# # You will need to implement the "__init__()" function to define the networks
# # structures and "forward()", to propagate your data, in the following problems.
# class DNN(nn.Module):
#     def __init__(self):
#         super(DNN, self).__init__()
#         pass
    
#     def forward(self, x):
#         raise NotImplementedError
    
#     def train_net(self, epochs=9, batchSize=16):
#         criterion = nn.CrossEntropyLoss()
#         optimizer = optim.Adam(self.parameters(), lr = 1e-5)
#         data_loader = DataLoader(train_dataset, batch_size=batchSize, shuffle=True, collate_fn=collate_fn_ner)
        
        
#         for epoch in range(epochs):
#             self.train()  # set network in training mode
#             print("Epoch # ", epoch)
#             for i, (X_input_ids, X_attention_mask, X_token_type_ids, y_label_ids) in enumerate(data_loader):
#                 print("Step # ", i)

#                 # move tensors to GPU
#                 X_input_ids, X_attention_mask, X_token_type_ids, y_label_ids = \
#                 X_input_ids.cuda(), X_attention_mask.cuda(), X_token_type_ids.cuda(), y_label_ids.cuda()

#                 # Now train the model using the optimizer and the batch data
#                 optimizer.zero_grad()
#                 output = self.forward((X_input_ids, X_attention_mask, X_token_type_ids))
#                 output = torch.transpose(output, 1, 2)
#                 loss = criterion(output.float(), y_label_ids)
#                 loss.backward()
#                 optimizer.step()
                
# #                 del X_input_ids
# #                 del X_attention_mask
# #                 del X_token_type_ids
# #                 del y_label_ids
# #                 torch.cuda.empty_cache()
#                 # xxxxxxxxxx End of your code, don't change anything else here xxxxxxxxxx

#             self.eval()  # set network in evaluation mode
#             ckpt_path = os.path.join(model_args['cache_dir'], f"model-{init_step + train_step_count}")
#             torch.save(self.state_dict(), ckpt_path)
# #             print ('Epoch:%d Accuracy: %f'%(epoch+1, test(testData, testLabels, self))) 
    
#     def __call__(self, x):
#         inputs = x
#         prediction = self.forward(inputs)
#         return prediction
    

In [69]:
# config

In [70]:
# class BertNER(DNN):

#     def __init__(self, model_path, top_model, is_train=False):
#         super(BertNER, self).__init__()
#         self.bert = BertModel.from_pretrained(model_path, from_tf=True)
#         self.top_model = top_model

#     def forward(self, x):
#         X_input_ids, X_attention_mask, X_token_type_ids = x
#         outputs = self.bert(input_ids=X_input_ids, attention_mask=X_attention_mask, token_type_ids=X_token_type_ids,
#                             return_dict=True, output_hidden_states=True)
        
#         # Experiment with sum of last four hidden states
#         last_layer = outputs.hidden_states[-1]
#         y = self.top_model(last_layer)
#         return y

In [71]:
# class ThreeHiddenLayer(DNN):
#     def __init__(self, hidden_units_list=[500, 250, 125], num_classes=3):
#         super(ThreeHiddenLayer, self).__init__()
#         hid1, hid2, hid3 = hidden_units_list
#         self.embedding_size = 768
#         self.fc1 = nn.Linear(self.embedding_size, hid1)
#         self.fc2 = nn.Linear(hid1, hid2)
#         self.fc3 = nn.Linear(hid2, hid3)
#         self.fc4 = nn.Linear(hid3, num_classes)

#     def forward(self, x):
#         x = F.relu(self.fc1(x))
#         x = F.relu(self.fc2(x))
#         x = F.relu(self.fc3(x))
#         x = self.fc4(x)
#         return x
    

In [72]:
# def collate_fn_ner(batch):
#     X_input_ids = torch.LongTensor([x.input_ids for x in batch])
#     X_token_type_ids = torch.LongTensor([x.token_type_ids for x in batch])
#     X_attention_mask = torch.FloatTensor([x.attention_mask for x in batch])
    
#     y_label_ids = [x.label_ids for x in batch]

#     return X_input_ids, X_attention_mask, X_token_type_ids, torch.LongTensor(y_label_ids)

In [73]:
# threeHiddenLayerClassifier = ThreeHiddenLayer()
# bertNER = BertNER(saved_model_path, threeHiddenLayerClassifier)

In [74]:
# bertNER = bertNER.cuda()

In [75]:
# bertNER.train_net()

In [76]:
# torch.cuda.empty_cache()
# print(torch.cuda.memory_summary(device=None, abbreviated=False))
# # import gc
# # del variables
# # gc.collect()

## Train top-model using the Trainer API

In [77]:
# list(model.parameters())[-1].data

In [78]:
# for name, param in model.named_parameters():
#     print(name, param.shape)

In [79]:
# Gene, bioinfer
# seed = 42 -> 0.86      0.87      0.87
# seed = 0 -> 0.85      0.89      0.87
# seed = 13 -> 0.85      0.86      0.86

In [None]:
# Cellline, jnlpba
# seed = 42 -> 0.63      0.70      0.66
# seed = 0 ->
# seed = 