# THIS NOTEBOOK EXISTS ONLY FOR TESTING, NO REAL PURPOSE

In [1]:
# Set a seed value
seed_value= 42
# 1. Set `PYTHONHASHSEED` environment variable at a fixed value
import os
os.environ['PYTHONHASHSEED']=str(seed_value)
# 2. Set `python` built-in pseudo-random generator at a fixed value
import random
random.seed(seed_value)
# 3. Set `numpy` pseudo-random generator at a fixed value
import numpy as np
np.random.seed(seed_value)
# 4. Set `pytorch` pseudo-random generator at a fixed value
import torch
torch.manual_seed(seed_value)
torch.backends.cudnn.deterministic = True

In [2]:
ENT = "Cellline"
DATASET = "cll"

In [3]:
data_path = "/sbksvol/gaurav/NER_data/"

In [4]:
import os
data_dir = os.path.join(data_path, ENT, DATASET)

## Prepare Data

In [5]:
# from run_ner import prepare_data, prepare_config_and_tokenizer
from data_utils import convert_tsv_to_txt, get_train_test_df

In [6]:
all_data = convert_tsv_to_txt(data_dir)
train_df, test_df = get_train_test_df(all_data)

In [8]:
# Print some data statistics
num_train_sents = len(all_data["train"]["words"])
num_dev_sents = len(all_data["dev"]["words"])
num_test_sents = len(all_data["test"]["words"])
print("num_train_sents, num_dev_sents, num_test_sents = ", num_train_sents, num_dev_sents, num_test_sents)


print("First 10 words in test data:")
print(test_df.head(10))


# a list that has all possible labels 
labels = np.sort(train_df['labels'].unique()).tolist()
label_map =  {i: label for i, label in enumerate(labels)}
num_labels = len(labels)
print("unique labels:", labels)

num_train_sents, num_dev_sents, num_test_sents =  121 22 61
First 10 words in test data:
   sentence_id       words labels
0            0          By      O
1            0    Northern      O
2            0        blot      O
3            0    analysis      O
4            0           ,      O
5            0         the      O
6            0  expression      O
7            0          of      O
8            0          IL      O
9            0           -      O
unique labels: ['B', 'I', 'O']


## Model Definition

In [10]:
from transformers import (
    BertConfig,
    BertTokenizer
)

model_args = dict()

# Path to pretrained model or model identifier from huggingface.co/models
model_args['model_name_or_path'] = 'dmis-lab/biobert-base-cased-v1.1'
# saved_model_path
# saved_model_path
# pytorch_dump_path
# 'dmis-lab/biobert-base-cased-v1.1'

# Where do you want to store the pretrained models downloaded from s3
model_args['cache_dir'] = "/sbksvol/gaurav/NER_out/"

# we skip basic white-space tokenization by passing do_basic_tokenize = False to the tokenizer
model_args['do_basic_tokenize'] = False


data_args = dict()

data_args['data_dir'] = data_dir

# "The maximum total input sequence length after tokenization. Sequences longer "
# "than this will be truncated, sequences shorter will be padded."
data_args['max_seq_length'] = 256

# Overwrite the cached training and evaluation sets
# this means the model does not have to tokenize/preprocess and cache the data each time it's called
# this can be made different for each NerDataset (training NerDataset, testing NerDataset)
data_args['overwrite_cache'] = True

config = BertConfig.from_pretrained(
    model_args['model_name_or_path'],
    num_labels=num_labels,
    id2label=label_map,
    label2id={label: i for i, label in enumerate(labels)},
    cache_dir=model_args['cache_dir']
)

# we skip basic white-space tokenization by passing do_basic_tokenize = False to the tokenizer
tokenizer = BertTokenizer.from_pretrained(
    model_args['model_name_or_path'],
    cache_dir=model_args['cache_dir']
#     ,do_basic_tokenize = model_args['do_basic_tokenize']
)

In [11]:
from utils_ner import NerDataset, Split

In [12]:
train_dataset = NerDataset(
  data_dir=data_args['data_dir'],
  tokenizer=tokenizer,
  labels=labels,
  model_type=config.model_type,
  max_seq_length=data_args['max_seq_length'],
  overwrite_cache=data_args['overwrite_cache'], # True
  mode=Split.train)

In [13]:
eval_dataset = NerDataset(
  data_dir=data_args['data_dir'],
  tokenizer=tokenizer,
  labels=labels,
  model_type=config.model_type,
  max_seq_length=data_args['max_seq_length'],
  overwrite_cache=data_args['overwrite_cache'],
  mode=Split.dev)

In [14]:
print(train_dataset.__len__(), eval_dataset.__len__())

120 21


## Train top-model using the Trainer API

In [15]:
from models import BertNERTopModel

In [17]:
top_model = {"name": "dense_layer_softmax", 
             "hidden_units_list": [500, 250, 125], 
             "activations_list": ["none", "none", "none", "none"]
            }


# ### First freeze bert weights and train

model = BertNERTopModel.from_pretrained(
    model_args['model_name_or_path'],
    config=config,
    cache_dir=model_args['cache_dir'],
    top_model=top_model
)

## base_model -> bert (excluding the classification layer)
for param in model.base_model.parameters():
    param.requires_grad = False


model.train()

training_args_dict = {
    'output_dir' : "model_output/",
    'num_train_epochs' : 20,
    'train_batch_size': 32,
    "save_strategy": "epoch",
    "evaluation_strategy": "epoch"
#     ,
#     "load_best_model_at_end": True
}

Initializing weights


Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.1 were not used when initializing BertNERTopModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertNERTopModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertNERTopModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertNERTopModel were not initialized from the model checkpoint at dmis-lab

### Create Trainer

In [18]:
from transformers.hf_argparser import HfArgumentParser
from transformers import TrainingArguments
from transformers import Trainer

In [20]:
import json

In [21]:
# ### Create Trainer
with open('training_args.json', 'w') as fp:
    json.dump(training_args_dict, fp)

parser = HfArgumentParser(TrainingArguments)
training_args = parser.parse_json_file(json_file="training_args.json")[0]


# ## Train

# Initialize the Trainer
trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=train_dataset,
  eval_dataset=eval_dataset
)

trainOutput = trainer.train()

Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,No log,1.007744,0.2473,84.921
2,No log,0.853826,0.2594,80.963
3,No log,0.621081,0.2545,82.512
4,No log,0.375918,0.2468,85.083
5,No log,0.218,0.2473,84.923
6,No log,0.162509,0.25,84.016
7,No log,0.143993,0.2504,83.871
8,No log,0.132813,0.249,84.345
9,No log,0.124075,0.2492,84.275
10,No log,0.11677,0.254,82.693


## Train

In [29]:
# ## Now reload the model from saved checkpoint

num_steps = trainOutput.global_step # 17880
checkpoint = f"checkpoint-{num_steps}"
top_model_path = f"{training_args_dict['output_dir']}/{checkpoint}" 

#### Config ####
config = BertConfig.from_pretrained(
    top_model_path,
    num_labels=num_labels,
    id2label=label_map,
    label2id={label: i for i, label in enumerate(labels)},
    cache_dir=model_args['cache_dir']
)

#### Model ####

reloaded_model = BertNERTopModel.from_pretrained(
    top_model_path,
    config=config,
    cache_dir=model_args['cache_dir'],
    top_model=top_model
)

Initializing weights


In [30]:
reloaded_model.top_model

{'name': 'dense_layer_softmax',
 'hidden_units_list': [500, 250, 125],
 'activations_list': ['none', 'none', 'none', 'none']}

In [31]:
#### Training args ####
training_args_dict = {
    'output_dir' : "model_output",
    'num_train_epochs' : 5,
    'train_batch_size': 32,
    'seed':int(42),
    "evaluation_strategy": "epoch"
#     ,"load_best_model_at_end": True
}

with open('training_args.json', 'w') as fp:
    json.dump(training_args_dict, fp)

parser = HfArgumentParser(TrainingArguments)
training_args = parser.parse_json_file(json_file="training_args.json")[0]


# ## Then unfreeze the bert weights and train end-to-end

model = reloaded_model

for param in model.base_model.parameters():
    param.requires_grad = True


model.to('cuda')
model.train()

BertNERTopModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [32]:
model.top_model

{'name': 'dense_layer_softmax',
 'hidden_units_list': [500, 250, 125],
 'activations_list': ['none', 'none', 'none', 'none']}

In [33]:
trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=train_dataset,
  eval_dataset=eval_dataset
)

# Begin training from the latest checkpoint
trainer.train(checkpoint)

Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,No log,0.047443,0.2502,83.927
2,No log,0.040522,0.2501,83.953
3,No log,0.038085,0.249,84.339
4,No log,0.03481,0.2583,81.288
5,No log,0.033017,0.2555,82.186


TrainOutput(global_step=75, training_loss=0.0750254758199056, metrics={'train_runtime': 24.5889, 'train_samples_per_second': 3.05, 'total_flos': 99773506252800.0, 'epoch': 5.0, 'init_mem_cpu_alloc_delta': 1374589, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 13096633, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 212238, 'train_mem_gpu_alloc_delta': 1308227584, 'train_mem_cpu_peaked_delta': 139932, 'train_mem_gpu_peaked_delta': 2274755584})

## Clean-up

In [34]:
import gc
gc.collect()
torch.cuda.empty_cache()

## Prepare test data

In [35]:
import numpy as np
from torch import nn

In [36]:
# we can pass overwrite_cache as True since we might like to make new predictions by just changing test.txt 
test_dataset = NerDataset(
  data_dir=data_args['data_dir'],
  tokenizer=tokenizer,
  labels=labels,
  model_type=config.model_type,
  max_seq_length=data_args['max_seq_length'],
  overwrite_cache=True,
  mode=Split.test)

# last layer output/activation has the shape of (batch_size, seq_len,num_of_labels)
output, label_ids, metrics = trainer.predict(test_dataset)
preds = np.argmax(output, axis=2)
batch_size, seq_len = preds.shape

# list of token-level predictions shape = (batch_size, seq_len)
preds_list = [[] for _ in range(batch_size)]
for i in range(batch_size):
    for j in range(seq_len):
        # ignore pad_tokens
        if label_ids[i, j] != nn.CrossEntropyLoss().ignore_index:
            preds_list[i].append(label_map[preds[i][j]])

In [37]:
def sentences_combiner(df):
    # 'words' and 'labels' are the column names in the CSV file
    tupple_function = lambda x: [(w, t) for w, t in zip(x["words"].values.tolist(),
                                                      x["labels"].values.tolist())]
    grouped = df.groupby("sentence_id").apply(tupple_function)
    return [s for s in grouped]

testing_sentences = sentences_combiner(test_df)
test_labels = [[w[1] for w in s] for s in testing_sentences]
test_tokens = [[w[0] for w in s] for s in testing_sentences]

# reconstruct full sentences from lists of (token,label) tuples 
# test_reconstructed = [" ".join([w[0] for w in s] ) for s in testing_sentences]

In [38]:
# make sure all test and pred sentences have the same length

test_labels_new = []
preds_list_new = []

for i, x in enumerate(test_labels):
    if len(x) == len(preds_list[i]):
        test_labels_new.append(x)
        preds_list_new.append(preds_list[i])
    else:
        print("ABORT")

## Get entity level scores

In [39]:
from seqeval.metrics import f1_score, classification_report
print("F1-score: {:.1%}".format(f1_score(test_labels_new, preds_list_new)))
print(classification_report(test_labels_new, preds_list_new))

F1-score: 94.2%
              precision    recall  f1-score   support

           _       0.94      0.95      0.94        77

   micro avg       0.94      0.95      0.94        77
   macro avg       0.94      0.95      0.94        77
weighted avg       0.94      0.95      0.94        77



In [50]:
# 0.78      0.66      0.71 -> gellus, 3-layer +softmax with relu after each fc layer 
# 0.97 0.94 0.95 -> cll, 3-layer +softmax with relu after each fc layer except the 1st, seed=42
# 0.96 0.86, 0.90, 0.94, 0.96, 0.95 -> '' seed=0 
# 0.97, 0.91, 0.94 -> '' seed=13

**The following results are for the cll dataset with different # relu layers**

In [51]:
# relu after 1st, 2nd and 3rd layer -> 0, 0, 0
# relu after 2nd and 3rd layer -> 0.86, 0.96, 0.91
# relu only after 3rd layer -> 0.92, 0.99, 0.95
# No relu -> 0.97, 0.94, 0.95

**The following results are for 3 hidden layers without any relu**

In [52]:
# cll
# seed = 42 -> 0.92      0.99      0.95
# seed = 0  -> 0.95      0.97      0.96
# seed = 13 -> 0.97      0.96      0.97
# seed = 20 -> 0.90      0.95      0.92
# seed = 50 -> 0.96      0.96      0.96
# seed = 75 -> 0.92      0.95      0.94
# seed = 100 -> 0.93      0.92      0.93
# -----------------------
# average f1 -> 95

**The following results are for 1 hidden layers without any relu**

In [53]:
# cll
# seed = 42 -> Not obtained yet
# seed = 0  -> 0.83      0.99      0.90
# seed = 13 -> 0.90      0.99      0.94
# seed = 20 -> 0.90      0.97      0.94
# seed = 50 -> Not obtained yet
# seed = 75 -> Not obtained yet
# seed = 100 -> Not obtained yet
# -----------------------
# average f1 -> Not obtained yet


In [54]:
# cellfinder
# seed = 42 -> 0.86      0.63      0.73
# seed = 0  -> 0.84      0.77      0.80
# seed = 13 -> 0.83      0.70      0.76


In [1]:
import sys
print(sys.path)

['/sbksvol/gaurav/NER_src', '/root/anaconda3/lib/python37.zip', '/root/anaconda3/lib/python3.7', '/root/anaconda3/lib/python3.7/lib-dynload', '', '/root/anaconda3/lib/python3.7/site-packages', '/root/anaconda3/lib/python3.7/site-packages/IPython/extensions', '/root/.ipython']
