# Installation of libraries to be used

In [1]:
!pip uninstall -y torch
!pip install --pre torch -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html --upgrade
!pip install transformers
!pip install allennlp
!pip install flashtool
!pip install ray
!pip install pandas

Found existing installation: torch 2.1.0+cu118
Uninstalling torch-2.1.0+cu118:
  Successfully uninstalled torch-2.1.0+cu118
Looking in links: https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html
Collecting torch
  Downloading torch-2.1.0-cp310-cp310-manylinux1_x86_64.whl (670.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m670.2/670.2 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m62.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==

# Data Fetching from google drive

In [2]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [3]:
!cp -r '/content/drive/MyDrive/nlpProjectNew/' './'

**Unlabeled data used for pretraining**

In [4]:
unlabeled_data = []
with open('/content/nlp_project/amazon-weak-ner-needle-main/bio_script/data/unlabeled_data/all_text.txt') as f:
  unlabeled_data = f.readlines()
length = len(unlabeled_data)

In [5]:
print("Length of unlabeled data used for pretraining : ",length)

Length of unlabeled data used for pretraining :  2258838


In [6]:
unlabeled_data[:10]

['Formate assay in body fluids: application in methanol poisoning.\n',
 'Delineation of the intimate details of the backbone conformation of pyridine nucleotide coenzymes in aqueous solution.\n',
 'Effect of chloroquine on cultured fibroblasts: release of lysosomal hydrolases and inhibition of their uptake.\n',
 'Metal substitutions incarbonic anhydrase: a halide ion probe study.\n',
 'Atomic models for the polypeptide backbones of myohemerythrin and hemerythrin.\n',
 'Studies of oxygen binding energy to hemoglobin molecule.\n',
 'Maturation of the adrenal medulla--IV. Effects of morphine.\n',
 'Comparison between procaine and isocarboxazid metabolism in vitro by a liver microsomal amidase-esterase.\n',
 'Radiochemical assay of glutathione S-epoxide transferase and its enhancement by phenobarbital in rat liver in vivo.\n',
 'Digitoxin metabolism by rat liver microsomes.\n']

In [7]:
training_data = unlabeled_data[:100000]
with open('/content/nlp_project/amazon-weak-ner-needle-main/bio_script/data/unlabeled_data/all_text_train.txt', 'w') as f:
    for line in training_data:
        f.write(f"{line}\n")

In [8]:
dev_data = unlabeled_data[100000:110000]
with open('/content/nlp_project/amazon-weak-ner-needle-main/bio_script/data/unlabeled_data/all_text_eval.txt', 'w') as f:
    for line in dev_data:
        f.write(f"{line}\n")

In [9]:
dev_data[0]

'Community health issue paramount as PAHO turns 75.\n'

# bert-base-uncased model

In [10]:
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [11]:
import logging
import math
import os
from dataclasses import dataclass, field
from typing import Optional
import time
import torch

from transformers import (
    CONFIG_MAPPING,
    MODEL_WITH_LM_HEAD_MAPPING,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    DataCollatorForPermutationLanguageModeling,
    HfArgumentParser,
    LineByLineTextDataset,
    PreTrainedTokenizer,
    TextDataset,
    Trainer,
    TrainingArguments,
    set_seed,
)


In [12]:
MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

In [13]:
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
    """

    # The model checkpoint for weights initialization. Set to None for training from scratch.
    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "The model checkpoint for weights initialization. Leave None if you want to train a model from scratch."
        },
    )

    # If training from scratch, pass a model type from the list of supported model types.
    model_type: Optional[str] = field(
        default=None,
        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
    )

    # Pretrained config name or path if it's different from the model_name_or_path.
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )

    # Pretrained tokenizer name or path if it's different from the model_name_or_path.
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )

    # Where to store the pretrained models downloaded from external sources.
    cache_dir: Optional[str] = field(
        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
    )

In [14]:
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    # The input training data file (a text file).
    train_data_file: Optional[str] = field(
        default=None, metadata={"help": "The input training data file (a text file)."}
    )

    # An optional input evaluation data file to evaluate the perplexity on (a text file).
    eval_data_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
    )

    # Whether distinct lines of text in the dataset are to be handled as distinct sequences.
    line_by_line: bool = field(
        default=False,
        metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
    )

    # Train with masked-language modeling loss instead of language modeling.
    mlm: bool = field(
        default=False, metadata={"help": "Train with masked-language modeling loss instead of language modeling."}
    )

    # Ratio of tokens to mask for masked language modeling loss.
    mlm_probability: float = field(
        default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
    )

    # Ratio of length of a span of masked tokens to surrounding context length for permutation language modeling.
    plm_probability: float = field(
        default=1 / 6,
        metadata={
            "help": "Ratio of length of a span of masked tokens to surrounding context length for permutation language modeling."
        },
    )

    # Maximum length of a span of masked tokens for permutation language modeling.
    max_span_length: int = field(
        default=5, metadata={"help": "Maximum length of a span of masked tokens for permutation language modeling."}
    )

    # Optional input sequence length after tokenization.
    # The training dataset will be truncated in blocks of this size for training.
    block_size: int = field(
        default=-1,
        metadata={
            "help": "Optional input sequence length after tokenization. "
            "The training dataset will be truncated in blocks of this size for training. "
            "Default to the model max input length for single sentence inputs (take into account special tokens)."
        },
    )

    # Overwrite the cached training and evaluation sets.
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )


In [15]:
class ParLineByLineTextDataset(LineByLineTextDataset):
    """
    This class is used to create a language modeling dataset for training models like BERT or RoBERTa.

    Args:
        tokenizer (PreTrainedTokenizer): The tokenizer for processing text data.
        file_path (str): The path to the text file used for dataset creation.
        block_size (int): The maximum length of a text block after tokenization.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int):
        # Check if the input file exists; raise an error if not found
        assert os.path.isfile(file_path), f"Input file path {file_path} not found"

        # Read and process the file content
        st_time = time.time()
        with open(file_path, encoding="utf-8") as f:
            # Read lines from the file, filter out empty and whitespace-only lines
            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

        print(time.time() - st_time)

        # Initialize attributes for the dataset
        self.examples = [None] * len(lines)
        self.tokenizer = tokenizer
        self.block_size = block_size
        self.lines = lines

    def __getitem__(self, i) -> torch.Tensor:
        # Check if the example at index i has been processed; if not, tokenize it
        if self.examples[i] is None:
            # Tokenize the line, add special tokens, and truncate it if necessary
            self.examples[i] = self.tokenizer(self.lines[i], add_special_tokens=True, truncation=True, max_length=self.block_size)["input_ids"]
        return torch.tensor(self.examples[i], dtype=torch.long)


In [16]:
def get_dataset(args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, evaluate=False):
    # Determine the file path based on whether the function is used for evaluation or training.
    file_path = args.eval_data_file if evaluate else args.train_data_file
    print("file_path in get_dataset =", file_path)

    # Check if the dataset should be processed line by line.
    if args.line_by_line:
        print('inside get_dataset line_by_line')
        # Create a ParLineByLineTextDataset when processing line by line.
        return ParLineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size)
    else:
        print('inside get_dataset else statement')
        # Create a TextDataset when not processing line by line.
        return TextDataset(
            tokenizer=tokenizer, file_path=file_path, block_size=args.block_size, overwrite_cache=args.overwrite_cache
        )


In [17]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device=",device)
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
print('ModelArguments=',ModelArguments)
print('DataTrainingArguments=',DataTrainingArguments)
print('TrainingArguments=',TrainingArguments)
print('parser=',parser)

device= cuda:0
ModelArguments= <class '__main__.ModelArguments'>
DataTrainingArguments= <class '__main__.DataTrainingArguments'>
TrainingArguments= <class 'transformers.training_args.TrainingArguments'>
parser= HfArgumentParser(prog='colab_kernel_launcher.py', usage=None, description=None, formatter_class=<class 'argparse.ArgumentDefaultsHelpFormatter'>, conflict_handler='error', add_help=True)


In [18]:
config = AutoConfig.from_pretrained('bert-base-uncased',cache_dir=None)
config

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [20]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased',cache_dir=None)
print("tokenizer1=",tokenizer)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

tokenizer1= PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


In [21]:
model = AutoModelWithLMHead.from_pretrained('bert-base-uncased',from_tf=bool(".ckpt" in 'bert-base-uncased'),config=config,cache_dir=None)
# print('model=',model)
print("len(tokeniser)=",len(tokenizer))
model.resize_token_embeddings(len(tokenizer))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


len(tokeniser)= 30522


Embedding(30522, 768, padding_idx=0)

In [22]:
train_data_file='/content/nlp_project/amazon-weak-ner-needle-main/bio_script/data/unlabeled_data/all_text_train.txt'

In [23]:
eval_data_file='/content/nlp_project/amazon-weak-ner-needle-main/bio_script/data/unlabeled_data/all_text_eval.txt'

In [24]:
data_args = DataTrainingArguments(
    train_data_file=train_data_file,    # Training data file path
    eval_data_file=eval_data_file,      # Evaluation data file path
    line_by_line=True,                 # Process data line by line
    mlm=True,                          # Train with masked-language modeling loss
    mlm_probability=0.15,              # Ratio of tokens to mask for masked language modeling loss
    plm_probability=0.16666666666666666,  # Ratio of length of a span of masked tokens to surrounding context length for permutation language modeling
    max_span_length=5,                # Maximum length of a span of masked tokens for permutation language modeling
    block_size=64,                    # Optional input sequence length after tokenization
    overwrite_cache=False              # Do not overwrite the cached training and evaluation sets
)


In [25]:
train_dataset = get_dataset(data_args, tokenizer=tokenizer)

file_path in get_dataset = /content/nlp_project/amazon-weak-ner-needle-main/bio_script/data/unlabeled_data/all_text_train.txt
inside get_dataset line_by_line
0.7899446487426758


In [26]:
eval_dataset = get_dataset(data_args, tokenizer=tokenizer,evaluate=True)

file_path in get_dataset = /content/nlp_project/amazon-weak-ner-needle-main/bio_script/data/unlabeled_data/all_text_eval.txt
inside get_dataset line_by_line
0.05732464790344238


In [27]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability)
print("data_collator=",data_collator)

data_collator= DataCollatorForLanguageModeling(tokenizer=PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), mlm=True, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')


In [28]:
training_args = TrainingArguments(
    output_dir='./output',  # Output directory for model checkpoints and logs
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    evaluation_strategy='steps',
    save_steps=30000,
    save_total_limit=2,
    report_to=None
)

In [29]:
trainer = Trainer(
model=model,
args=training_args,
data_collator=data_collator,
train_dataset=train_dataset,
eval_dataset=eval_dataset
# prediction_loss_only=True,
)
print("trainer=",trainer)

trainer= <transformers.trainer.Trainer object at 0x7ceeb2f205e0>


In [30]:
import os

# Disable WandB logging
os.environ["WANDB_DISABLED"] = "true"
trainer.train()
trainer.save_model()

***** Running training *****
  Num examples = 100000
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 4689
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Step,Training Loss,Validation Loss
500,2.1748,1.972207
1000,1.9958,1.910724
1500,1.8952,1.813702
2000,1.7965,1.783881
2500,1.7732,1.734854
3000,1.7294,1.724138
3500,1.6808,1.673099
4000,1.6652,1.697249
4500,1.6548,1.661559


***** Running Evaluation *****
  Num examples = 10000
  Batch size = 64
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 64
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 64
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 64
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 64
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 64
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 64
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 64
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./output
Configuration saved in ./output/config.json
Model weights saved in ./output/pytorch_model.bin


In [31]:
!cp '/content/output/pytorch_model.bin' '/content/drive/MyDrive'

In [32]:
!cp '/content/output/config.json' '/content/drive/MyDrive'